utf.c 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. /*
  2. * Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org>
  3. *
  4. * Jansson is free software; you can redistribute it and/or modify
  5. * it under the terms of the MIT license. See LICENSE for details.
  6. */
  7. #include <string.h>
  8. #include "utf.h"
  /*
   * Encode a single Unicode code point as UTF-8.
   *
   * codepoint: the value to encode.
   * buffer:    receives 1 to 4 bytes; no NUL terminator is written.
   * size:      receives the number of bytes stored in buffer.
   *
   * Returns 0 on success. Returns -1 without writing to buffer or *size
   * when codepoint is negative, greater than 0x10FFFF, or a UTF-16
   * surrogate half (0xD800..0xDFFF); none of these has a well-formed
   * UTF-8 encoding.
   */
  int utf8_encode(int32_t codepoint, char *buffer, size_t *size)
  {
      /* Write through unsigned char so that storing bytes >= 0x80 does not
         depend on the implementation-defined int -> char conversion. */
      unsigned char *out = (unsigned char *)buffer;

      if(codepoint < 0 || codepoint > 0x10FFFF)
          return -1;

      /* Surrogate halves are reserved for UTF-16 and must never appear in
         UTF-8; utf8_check_full() rejects them on the decoding side. */
      if(codepoint >= 0xD800 && codepoint <= 0xDFFF)
          return -1;

      if(codepoint < 0x80) {
          /* 0xxxxxxx */
          out[0] = (unsigned char)codepoint;
          *size = 1;
      }
      else if(codepoint < 0x800) {
          /* 110xxxxx 10xxxxxx */
          out[0] = (unsigned char)(0xC0 | (codepoint >> 6));
          out[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
          *size = 2;
      }
      else if(codepoint < 0x10000) {
          /* 1110xxxx 10xxxxxx 10xxxxxx */
          out[0] = (unsigned char)(0xE0 | (codepoint >> 12));
          out[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
          out[2] = (unsigned char)(0x80 | (codepoint & 0x3F));
          *size = 3;
      }
      else {
          /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
          out[0] = (unsigned char)(0xF0 | (codepoint >> 18));
          out[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
          out[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
          out[3] = (unsigned char)(0x80 | (codepoint & 0x3F));
          *size = 4;
      }

      return 0;
  }
  /*
   * Classify the first byte of a UTF-8 sequence.
   *
   * Returns the total length in bytes (1 to 4) of the sequence that the
   * byte introduces, or 0 when the byte cannot start a sequence.
   */
  size_t utf8_check_first(char byte)
  {
      const unsigned char lead = (unsigned char)byte;

      /* The ranges are tested in ascending order, so each test only needs
         an upper bound. */
      if(lead <= 0x7F)
          return 1; /* plain ASCII */

      if(lead <= 0xC1)
          return 0; /* 0x80..0xBF: continuation byte;
                       0xC0, 0xC1: overlong encoding of an ASCII byte */

      if(lead <= 0xDF)
          return 2; /* 0xC2..0xDF: 2-byte sequence */

      if(lead <= 0xEF)
          return 3; /* 0xE0..0xEF: 3-byte sequence */

      if(lead <= 0xF4)
          return 4; /* 0xF0..0xF4: 4-byte sequence */

      /* 0xF5..0xFF: restricted (start of 4-, 5- or 6-byte sequence) or
         invalid UTF-8 */
      return 0;
  }
  /*
   * Validate and decode one multi-byte UTF-8 sequence.
   *
   * buffer:    start of the sequence; exactly `size` bytes are read.
   * size:      claimed sequence length; only 2, 3 and 4 are accepted.
   * codepoint: if non-NULL, receives the decoded value on success.
   *
   * Returns 1 when the sequence is valid, 0 otherwise. A sequence is
   * rejected when `size` is unsupported, a trailing byte is not a
   * continuation byte, the encoding is overlong, or the decoded value is a
   * UTF-16 surrogate half or lies above 0x10FFFF.
   *
   * Only the low payload bits of the first byte are used; its high bits
   * are not checked against `size` here.
   */
  size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint)
  {
      unsigned char lead_mask; /* payload bits carried by the first byte */
      int32_t minimum;         /* smallest value that needs `size` bytes */
      int32_t value;
      size_t i;

      /* Validate the length before touching the buffer, so that an empty
         buffer passed with an unsupported size is never dereferenced. */
      switch(size) {
          case 2:
              lead_mask = 0x1F;
              minimum = 0x80;
              break;
          case 3:
              lead_mask = 0x0F;
              minimum = 0x800;
              break;
          case 4:
              lead_mask = 0x07;
              minimum = 0x10000;
              break;
          default:
              return 0;
      }

      value = (unsigned char)buffer[0] & lead_mask;

      for(i = 1; i < size; i++) {
          const unsigned char u = (unsigned char)buffer[i];

          if((u & 0xC0) != 0x80) {
              /* not a continuation byte (10xxxxxx) */
              return 0;
          }

          value = (value << 6) | (u & 0x3F);
      }

      if(value < minimum) {
          /* overlong encoding */
          return 0;
      }

      if(value > 0x10FFFF) {
          /* not in Unicode range */
          return 0;
      }

      if(value >= 0xD800 && value <= 0xDFFF) {
          /* invalid code point (UTF-16 surrogate halves) */
          return 0;
      }

      if(codepoint)
          *codepoint = value;

      return 1;
  }
  /*
   * Decode the UTF-8 sequence at the start of buffer and step past it.
   *
   * buffer:    input bytes.
   * bufsize:   number of bytes available in buffer.
   * codepoint: if non-NULL, receives the decoded value on success.
   *
   * Returns a pointer to the byte following the decoded sequence. When
   * bufsize is 0, buffer itself is returned and *codepoint is not written.
   * Returns NULL when the first byte cannot start a sequence, the sequence
   * would extend past bufsize, or the sequence fails validation.
   */
  const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint)
  {
      size_t seq_len;
      int32_t decoded;

      if(bufsize == 0)
          return buffer;

      seq_len = utf8_check_first(buffer[0]);
      if(seq_len == 0)
          return NULL;

      if(seq_len > 1) {
          /* Multi-byte: make sure the whole sequence is available before
             validating and decoding it. */
          if(seq_len > bufsize)
              return NULL;

          if(!utf8_check_full(buffer, seq_len, &decoded))
              return NULL;
      }
      else {
          /* Single byte: the byte value is the code point. */
          decoded = (unsigned char)buffer[0];
      }

      if(codepoint != NULL)
          *codepoint = decoded;

      return buffer + seq_len;
  }
  /*
   * Check that the first `length` bytes of `string` are valid UTF-8.
   *
   * Returns 1 when every sequence in the range is valid, or when length
   * is 0. Returns 0 at the first byte that cannot start a sequence, at a
   * sequence truncated by the end of the range, or at a sequence that
   * fails validation.
   */
  int utf8_check_string(const char *string, size_t length)
  {
      size_t pos = 0;

      while(pos < length) {
          const size_t seq_len = utf8_check_first(string[pos]);

          if(seq_len == 0)
              return 0;

          if(seq_len > 1) {
              /* A multi-byte sequence must fit in the remaining bytes. */
              if(seq_len > length - pos)
                  return 0;

              if(!utf8_check_full(string + pos, seq_len, NULL))
                  return 0;
          }

          pos += seq_len;
      }

      return 1;
  }