punycode.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. 'use strict';
  2. /** Highest positive signed 32-bit float value */
  3. const maxInt = 2147483647; // aka. 0x7FFFFFFF or 2^31-1
  4. /** Bootstring parameters */
  5. const base = 36;
  6. const tMin = 1;
  7. const tMax = 26;
  8. const skew = 38;
  9. const damp = 700;
  10. const initialBias = 72;
  11. const initialN = 128; // 0x80
  12. const delimiter = '-'; // '\x2D'
  13. /** Regular expressions */
  14. const regexPunycode = /^xn--/;
  15. const regexNonASCII = /[^\0-\x7E]/; // non-ASCII chars
  16. const regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g; // RFC 3490 separators
  17. /** Error messages */
  18. const errors = {
  19. 'overflow': 'Overflow: input needs wider integers to process',
  20. 'not-basic': 'Illegal input >= 0x80 (not a basic code point)',
  21. 'invalid-input': 'Invalid input'
  22. };
  23. /** Convenience shortcuts */
  24. const baseMinusTMin = base - tMin;
  25. const floor = Math.floor;
  26. const stringFromCharCode = String.fromCharCode;
  27. /*--------------------------------------------------------------------------*/
  28. /**
  29. * A generic error utility function.
  30. * @private
  31. * @param {String} type The error type.
  32. * @returns {Error} Throws a `RangeError` with the applicable error message.
  33. */
  34. function error(type) {
  35. throw new RangeError(errors[type]);
  36. }
  37. /**
  38. * A generic `Array#map` utility function.
  39. * @private
  40. * @param {Array} array The array to iterate over.
  41. * @param {Function} callback The function that gets called for every array
  42. * item.
  43. * @returns {Array} A new array of values returned by the callback function.
  44. */
  45. function map(array, fn) {
  46. const result = [];
  47. let length = array.length;
  48. while (length--) {
  49. result[length] = fn(array[length]);
  50. }
  51. return result;
  52. }
  53. /**
  54. * A simple `Array#map`-like wrapper to work with domain name strings or email
  55. * addresses.
  56. * @private
  57. * @param {String} domain The domain name or email address.
  58. * @param {Function} callback The function that gets called for every
  59. * character.
  60. * @returns {Array} A new string of characters returned by the callback
  61. * function.
  62. */
  63. function mapDomain(string, fn) {
  64. const parts = string.split('@');
  65. let result = '';
  66. if (parts.length > 1) {
  67. // In email addresses, only the domain name should be punycoded. Leave
  68. // the local part (i.e. everything up to `@`) intact.
  69. result = parts[0] + '@';
  70. string = parts[1];
  71. }
  72. // Avoid `split(regex)` for IE8 compatibility. See #17.
  73. string = string.replace(regexSeparators, '\x2E');
  74. const labels = string.split('.');
  75. const encoded = map(labels, fn).join('.');
  76. return result + encoded;
  77. }
  78. /**
  79. * Creates an array containing the numeric code points of each Unicode
  80. * character in the string. While JavaScript uses UCS-2 internally,
  81. * this function will convert a pair of surrogate halves (each of which
  82. * UCS-2 exposes as separate characters) into a single code point,
  83. * matching UTF-16.
  84. * @see `punycode.ucs2.encode`
  85. * @see <https://mathiasbynens.be/notes/javascript-encoding>
  86. * @memberOf punycode.ucs2
  87. * @name decode
  88. * @param {String} string The Unicode input string (UCS-2).
  89. * @returns {Array} The new array of code points.
  90. */
  91. function ucs2decode(string) {
  92. const output = [];
  93. let counter = 0;
  94. const length = string.length;
  95. while (counter < length) {
  96. const value = string.charCodeAt(counter++);
  97. if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
  98. // It's a high surrogate, and there is a next character.
  99. const extra = string.charCodeAt(counter++);
  100. if ((extra & 0xFC00) == 0xDC00) { // Low surrogate.
  101. output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
  102. } else {
  103. // It's an unmatched surrogate; only append this code unit, in case the
  104. // next code unit is the high surrogate of a surrogate pair.
  105. output.push(value);
  106. counter--;
  107. }
  108. } else {
  109. output.push(value);
  110. }
  111. }
  112. return output;
  113. }
  114. /**
  115. * Creates a string based on an array of numeric code points.
  116. * @see `punycode.ucs2.decode`
  117. * @memberOf punycode.ucs2
  118. * @name encode
  119. * @param {Array} codePoints The array of numeric code points.
  120. * @returns {String} The new Unicode string (UCS-2).
  121. */
  122. const ucs2encode = array => String.fromCodePoint(...array);
  123. /**
  124. * Converts a basic code point into a digit/integer.
  125. * @see `digitToBasic()`
  126. * @private
  127. * @param {Number} codePoint The basic numeric code point value.
  128. * @returns {Number} The numeric value of a basic code point (for use in
  129. * representing integers) in the range `0` to `base - 1`, or `base` if
  130. * the code point does not represent a value.
  131. */
  132. const basicToDigit = function(codePoint) {
  133. if (codePoint - 0x30 < 0x0A) {
  134. return codePoint - 0x16;
  135. }
  136. if (codePoint - 0x41 < 0x1A) {
  137. return codePoint - 0x41;
  138. }
  139. if (codePoint - 0x61 < 0x1A) {
  140. return codePoint - 0x61;
  141. }
  142. return base;
  143. };
  144. /**
  145. * Converts a digit/integer into a basic code point.
  146. * @see `basicToDigit()`
  147. * @private
  148. * @param {Number} digit The numeric value of a basic code point.
  149. * @returns {Number} The basic code point whose value (when used for
  150. * representing integers) is `digit`, which needs to be in the range
  151. * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
  152. * used; else, the lowercase form is used. The behavior is undefined
  153. * if `flag` is non-zero and `digit` has no uppercase form.
  154. */
  155. const digitToBasic = function(digit, flag) {
  156. // 0..25 map to ASCII a..z or A..Z
  157. // 26..35 map to ASCII 0..9
  158. return digit + 22 + 75 * (digit < 26) - ((flag != 0) << 5);
  159. };
  160. /**
  161. * Bias adaptation function as per section 3.4 of RFC 3492.
  162. * https://tools.ietf.org/html/rfc3492#section-3.4
  163. * @private
  164. */
  165. const adapt = function(delta, numPoints, firstTime) {
  166. let k = 0;
  167. delta = firstTime ? floor(delta / damp) : delta >> 1;
  168. delta += floor(delta / numPoints);
  169. for (/* no initialization */; delta > baseMinusTMin * tMax >> 1; k += base) {
  170. delta = floor(delta / baseMinusTMin);
  171. }
  172. return floor(k + (baseMinusTMin + 1) * delta / (delta + skew));
  173. };
  174. /**
  175. * Converts a Punycode string of ASCII-only symbols to a string of Unicode
  176. * symbols.
  177. * @memberOf punycode
  178. * @param {String} input The Punycode string of ASCII-only symbols.
  179. * @returns {String} The resulting string of Unicode symbols.
  180. */
  181. const decode = function(input) {
  182. // Don't use UCS-2.
  183. const output = [];
  184. const inputLength = input.length;
  185. let i = 0;
  186. let n = initialN;
  187. let bias = initialBias;
  188. // Handle the basic code points: let `basic` be the number of input code
  189. // points before the last delimiter, or `0` if there is none, then copy
  190. // the first basic code points to the output.
  191. let basic = input.lastIndexOf(delimiter);
  192. if (basic < 0) {
  193. basic = 0;
  194. }
  195. for (let j = 0; j < basic; ++j) {
  196. // if it's not a basic code point
  197. if (input.charCodeAt(j) >= 0x80) {
  198. error('not-basic');
  199. }
  200. output.push(input.charCodeAt(j));
  201. }
  202. // Main decoding loop: start just after the last delimiter if any basic code
  203. // points were copied; start at the beginning otherwise.
  204. for (let index = basic > 0 ? basic + 1 : 0; index < inputLength; /* no final expression */) {
  205. // `index` is the index of the next character to be consumed.
  206. // Decode a generalized variable-length integer into `delta`,
  207. // which gets added to `i`. The overflow checking is easier
  208. // if we increase `i` as we go, then subtract off its starting
  209. // value at the end to obtain `delta`.
  210. let oldi = i;
  211. for (let w = 1, k = base; /* no condition */; k += base) {
  212. if (index >= inputLength) {
  213. error('invalid-input');
  214. }
  215. const digit = basicToDigit(input.charCodeAt(index++));
  216. if (digit >= base || digit > floor((maxInt - i) / w)) {
  217. error('overflow');
  218. }
  219. i += digit * w;
  220. const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);
  221. if (digit < t) {
  222. break;
  223. }
  224. const baseMinusT = base - t;
  225. if (w > floor(maxInt / baseMinusT)) {
  226. error('overflow');
  227. }
  228. w *= baseMinusT;
  229. }
  230. const out = output.length + 1;
  231. bias = adapt(i - oldi, out, oldi == 0);
  232. // `i` was supposed to wrap around from `out` to `0`,
  233. // incrementing `n` each time, so we'll fix that now:
  234. if (floor(i / out) > maxInt - n) {
  235. error('overflow');
  236. }
  237. n += floor(i / out);
  238. i %= out;
  239. // Insert `n` at position `i` of the output.
  240. output.splice(i++, 0, n);
  241. }
  242. return String.fromCodePoint(...output);
  243. };
  244. /**
  245. * Converts a string of Unicode symbols (e.g. a domain name label) to a
  246. * Punycode string of ASCII-only symbols.
  247. * @memberOf punycode
  248. * @param {String} input The string of Unicode symbols.
  249. * @returns {String} The resulting Punycode string of ASCII-only symbols.
  250. */
  251. const encode = function(input) {
  252. const output = [];
  253. // Convert the input in UCS-2 to an array of Unicode code points.
  254. input = ucs2decode(input);
  255. // Cache the length.
  256. let inputLength = input.length;
  257. // Initialize the state.
  258. let n = initialN;
  259. let delta = 0;
  260. let bias = initialBias;
  261. // Handle the basic code points.
  262. for (const currentValue of input) {
  263. if (currentValue < 0x80) {
  264. output.push(stringFromCharCode(currentValue));
  265. }
  266. }
  267. let basicLength = output.length;
  268. let handledCPCount = basicLength;
  269. // `handledCPCount` is the number of code points that have been handled;
  270. // `basicLength` is the number of basic code points.
  271. // Finish the basic string with a delimiter unless it's empty.
  272. if (basicLength) {
  273. output.push(delimiter);
  274. }
  275. // Main encoding loop:
  276. while (handledCPCount < inputLength) {
  277. // All non-basic code points < n have been handled already. Find the next
  278. // larger one:
  279. let m = maxInt;
  280. for (const currentValue of input) {
  281. if (currentValue >= n && currentValue < m) {
  282. m = currentValue;
  283. }
  284. }
  285. // Increase `delta` enough to advance the decoder's <n,i> state to <m,0>,
  286. // but guard against overflow.
  287. const handledCPCountPlusOne = handledCPCount + 1;
  288. if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) {
  289. error('overflow');
  290. }
  291. delta += (m - n) * handledCPCountPlusOne;
  292. n = m;
  293. for (const currentValue of input) {
  294. if (currentValue < n && ++delta > maxInt) {
  295. error('overflow');
  296. }
  297. if (currentValue == n) {
  298. // Represent delta as a generalized variable-length integer.
  299. let q = delta;
  300. for (let k = base; /* no condition */; k += base) {
  301. const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);
  302. if (q < t) {
  303. break;
  304. }
  305. const qMinusT = q - t;
  306. const baseMinusT = base - t;
  307. output.push(
  308. stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0))
  309. );
  310. q = floor(qMinusT / baseMinusT);
  311. }
  312. output.push(stringFromCharCode(digitToBasic(q, 0)));
  313. bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength);
  314. delta = 0;
  315. ++handledCPCount;
  316. }
  317. }
  318. ++delta;
  319. ++n;
  320. }
  321. return output.join('');
  322. };
  323. /**
  324. * Converts a Punycode string representing a domain name or an email address
  325. * to Unicode. Only the Punycoded parts of the input will be converted, i.e.
  326. * it doesn't matter if you call it on a string that has already been
  327. * converted to Unicode.
  328. * @memberOf punycode
  329. * @param {String} input The Punycoded domain name or email address to
  330. * convert to Unicode.
  331. * @returns {String} The Unicode representation of the given Punycode
  332. * string.
  333. */
  334. const toUnicode = function(input) {
  335. return mapDomain(input, function(string) {
  336. return regexPunycode.test(string)
  337. ? decode(string.slice(4).toLowerCase())
  338. : string;
  339. });
  340. };
  341. /**
  342. * Converts a Unicode string representing a domain name or an email address to
  343. * Punycode. Only the non-ASCII parts of the domain name will be converted,
  344. * i.e. it doesn't matter if you call it with a domain that's already in
  345. * ASCII.
  346. * @memberOf punycode
  347. * @param {String} input The domain name or email address to convert, as a
  348. * Unicode string.
  349. * @returns {String} The Punycode representation of the given domain name or
  350. * email address.
  351. */
  352. const toASCII = function(input) {
  353. return mapDomain(input, function(string) {
  354. return regexNonASCII.test(string)
  355. ? 'xn--' + encode(string)
  356. : string;
  357. });
  358. };
  359. /*--------------------------------------------------------------------------*/
  360. /** Define the public API */
  361. const punycode = {
  362. /**
  363. * A string representing the current Punycode.js version number.
  364. * @memberOf punycode
  365. * @type String
  366. */
  367. 'version': '2.1.0',
  368. /**
  369. * An object of methods to convert from JavaScript's internal character
  370. * representation (UCS-2) to Unicode code points, and back.
  371. * @see <https://mathiasbynens.be/notes/javascript-encoding>
  372. * @memberOf punycode
  373. * @type Object
  374. */
  375. 'ucs2': {
  376. 'decode': ucs2decode,
  377. 'encode': ucs2encode
  378. },
  379. 'decode': decode,
  380. 'encode': encode,
  381. 'toASCII': toASCII,
  382. 'toUnicode': toUnicode
  383. };
  384. module.exports = punycode;