// File: no-misleading-character-class.js


/**
 * @author Toru Nagashima <https://github.com/mysticatea>
 */
"use strict";

const {
    CALL,
    CONSTRUCT,
    ReferenceTracker,
    getStaticValue,
    getStringIfConstant,
} = require("@eslint-community/eslint-utils");
const { RegExpParser, visitRegExpAST } = require("@eslint-community/regexpp");
const {
    isCombiningCharacter,
    isEmojiModifier,
    isRegionalIndicatorSymbol,
    isSurrogatePair,
} = require("./utils/unicode");
const astUtils = require("./utils/ast-utils.js");
const { isValidWithUnicodeFlag } = require("./utils/regular-expressions");
const {
    parseStringLiteral,
    parseTemplateToken,
} = require("./utils/char-source");
//------------------------------------------------------------------------------
// Helpers
//------------------------------------------------------------------------------

/**
 * @typedef {import('@eslint-community/regexpp').AST.Character} Character
 * @typedef {import('@eslint-community/regexpp').AST.CharacterClassElement} CharacterClassElement
 */
  32. /**
  33. * Iterate character sequences of a given nodes.
  34. *
  35. * CharacterClassRange syntax can steal a part of character sequence,
  36. * so this function reverts CharacterClassRange syntax and restore the sequence.
  37. * @param {CharacterClassElement[]} nodes The node list to iterate character sequences.
  38. * @returns {IterableIterator<Character[]>} The list of character sequences.
  39. */
  40. function* iterateCharacterSequence(nodes) {
  41. /** @type {Character[]} */
  42. let seq = [];
  43. for (const node of nodes) {
  44. switch (node.type) {
  45. case "Character":
  46. seq.push(node);
  47. break;
  48. case "CharacterClassRange":
  49. seq.push(node.min);
  50. yield seq;
  51. seq = [node.max];
  52. break;
  53. case "CharacterSet":
  54. case "CharacterClass": // [[]] nesting character class
  55. case "ClassStringDisjunction": // \q{...}
  56. case "ExpressionCharacterClass": // [A--B]
  57. if (seq.length > 0) {
  58. yield seq;
  59. seq = [];
  60. }
  61. break;
  62. // no default
  63. }
  64. }
  65. if (seq.length > 0) {
  66. yield seq;
  67. }
  68. }
  69. /**
  70. * Checks whether the given character node is a Unicode code point escape or not.
  71. * @param {Character} char the character node to check.
  72. * @returns {boolean} `true` if the character node is a Unicode code point escape.
  73. */
  74. function isUnicodeCodePointEscape(char) {
  75. return /^\\u\{[\da-f]+\}$/iu.test(char.raw);
  76. }
  77. /**
  78. * Each function returns matched characters if it detects that kind of problem.
  79. * @type {Record<string, (chars: Character[]) => IterableIterator<Character[]>>}
  80. */
  81. const findCharacterSequences = {
  82. *surrogatePairWithoutUFlag(chars) {
  83. for (const [index, char] of chars.entries()) {
  84. const previous = chars[index - 1];
  85. if (
  86. previous &&
  87. char &&
  88. isSurrogatePair(previous.value, char.value) &&
  89. !isUnicodeCodePointEscape(previous) &&
  90. !isUnicodeCodePointEscape(char)
  91. ) {
  92. yield [previous, char];
  93. }
  94. }
  95. },
  96. *surrogatePair(chars) {
  97. for (const [index, char] of chars.entries()) {
  98. const previous = chars[index - 1];
  99. if (
  100. previous &&
  101. char &&
  102. isSurrogatePair(previous.value, char.value) &&
  103. (isUnicodeCodePointEscape(previous) ||
  104. isUnicodeCodePointEscape(char))
  105. ) {
  106. yield [previous, char];
  107. }
  108. }
  109. },
  110. *combiningClass(chars, unfilteredChars) {
  111. /*
  112. * When `allowEscape` is `true`, a combined character should only be allowed if the combining mark appears as an escape sequence.
  113. * This means that the base character should be considered even if it's escaped.
  114. */
  115. for (const [index, char] of chars.entries()) {
  116. const previous = unfilteredChars[index - 1];
  117. if (
  118. previous &&
  119. char &&
  120. isCombiningCharacter(char.value) &&
  121. !isCombiningCharacter(previous.value)
  122. ) {
  123. yield [previous, char];
  124. }
  125. }
  126. },
  127. *emojiModifier(chars) {
  128. for (const [index, char] of chars.entries()) {
  129. const previous = chars[index - 1];
  130. if (
  131. previous &&
  132. char &&
  133. isEmojiModifier(char.value) &&
  134. !isEmojiModifier(previous.value)
  135. ) {
  136. yield [previous, char];
  137. }
  138. }
  139. },
  140. *regionalIndicatorSymbol(chars) {
  141. for (const [index, char] of chars.entries()) {
  142. const previous = chars[index - 1];
  143. if (
  144. previous &&
  145. char &&
  146. isRegionalIndicatorSymbol(char.value) &&
  147. isRegionalIndicatorSymbol(previous.value)
  148. ) {
  149. yield [previous, char];
  150. }
  151. }
  152. },
  153. *zwj(chars) {
  154. let sequence = null;
  155. for (const [index, char] of chars.entries()) {
  156. const previous = chars[index - 1];
  157. const next = chars[index + 1];
  158. if (
  159. previous &&
  160. char &&
  161. next &&
  162. char.value === 0x200d &&
  163. previous.value !== 0x200d &&
  164. next.value !== 0x200d
  165. ) {
  166. if (sequence) {
  167. if (sequence.at(-1) === previous) {
  168. sequence.push(char, next); // append to the sequence
  169. } else {
  170. yield sequence;
  171. sequence = chars.slice(index - 1, index + 2);
  172. }
  173. } else {
  174. sequence = chars.slice(index - 1, index + 2);
  175. }
  176. }
  177. }
  178. if (sequence) {
  179. yield sequence;
  180. }
  181. },
  182. };
  183. const kinds = Object.keys(findCharacterSequences);
  184. /**
  185. * Gets the value of the given node if it's a static value other than a regular expression object,
  186. * or the node's `regex` property.
  187. * The purpose of this method is to provide a replacement for `getStaticValue` in environments where certain regular expressions cannot be evaluated.
  188. * A known example is Node.js 18 which does not support the `v` flag.
  189. * Calling `getStaticValue` on a regular expression node with the `v` flag on Node.js 18 always returns `null`.
  190. * A limitation of this method is that it can only detect a regular expression if the specified node is itself a regular expression literal node.
  191. * @param {ASTNode | undefined} node The node to be inspected.
  192. * @param {Scope} initialScope Scope to start finding variables. This function tries to resolve identifier references which are in the given scope.
  193. * @returns {{ value: any } | { regex: { pattern: string, flags: string } } | null} The static value of the node, or `null`.
  194. */
  195. function getStaticValueOrRegex(node, initialScope) {
  196. if (!node) {
  197. return null;
  198. }
  199. if (node.type === "Literal" && node.regex) {
  200. return { regex: node.regex };
  201. }
  202. const staticValue = getStaticValue(node, initialScope);
  203. if (staticValue?.value instanceof RegExp) {
  204. return null;
  205. }
  206. return staticValue;
  207. }
  208. /**
  209. * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
  210. * This function requires the source text of the character to be known.
  211. * @param {Character} char Character to check.
  212. * @param {string} charSource Source text of the character to check.
  213. * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
  214. */
  215. function checkForAcceptableEscape(char, charSource) {
  216. if (!charSource.startsWith("\\")) {
  217. return false;
  218. }
  219. const match = /(?<=^\\+).$/su.exec(charSource);
  220. return match?.[0] !== String.fromCodePoint(char.value);
  221. }
  222. /**
  223. * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
  224. * This function works with characters that are produced by a string or template literal.
  225. * It requires the source text and the CodeUnit list of the literal to be known.
  226. * @param {Character} char Character to check.
  227. * @param {string} nodeSource Source text of the string or template literal that produces the character.
  228. * @param {CodeUnit[]} codeUnits List of CodeUnit objects of the literal that produces the character.
  229. * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
  230. */
  231. function checkForAcceptableEscapeInString(char, nodeSource, codeUnits) {
  232. const firstIndex = char.start;
  233. const lastIndex = char.end - 1;
  234. const start = codeUnits[firstIndex].start;
  235. const end = codeUnits[lastIndex].end;
  236. const charSource = nodeSource.slice(start, end);
  237. return checkForAcceptableEscape(char, charSource);
  238. }
//------------------------------------------------------------------------------
// Rule Definition
//------------------------------------------------------------------------------
  242. /** @type {import('../types').Rule.RuleModule} */
  243. module.exports = {
  244. meta: {
  245. type: "problem",
  246. defaultOptions: [
  247. {
  248. allowEscape: false,
  249. },
  250. ],
  251. docs: {
  252. description:
  253. "Disallow characters which are made with multiple code points in character class syntax",
  254. recommended: true,
  255. url: "https://eslint.org/docs/latest/rules/no-misleading-character-class",
  256. },
  257. hasSuggestions: true,
  258. schema: [
  259. {
  260. type: "object",
  261. properties: {
  262. allowEscape: {
  263. type: "boolean",
  264. },
  265. },
  266. additionalProperties: false,
  267. },
  268. ],
  269. messages: {
  270. surrogatePairWithoutUFlag:
  271. "Unexpected surrogate pair in character class. Use 'u' flag.",
  272. surrogatePair: "Unexpected surrogate pair in character class.",
  273. combiningClass: "Unexpected combined character in character class.",
  274. emojiModifier: "Unexpected modified Emoji in character class.",
  275. regionalIndicatorSymbol:
  276. "Unexpected national flag in character class.",
  277. zwj: "Unexpected joined character sequence in character class.",
  278. suggestUnicodeFlag: "Add unicode 'u' flag to regex.",
  279. },
  280. },
  281. create(context) {
  282. const [{ allowEscape }] = context.options;
  283. const sourceCode = context.sourceCode;
  284. const parser = new RegExpParser();
  285. const checkedPatternNodes = new Set();
  286. /**
  287. * Verify a given regular expression.
  288. * @param {Node} node The node to report.
  289. * @param {string} pattern The regular expression pattern to verify.
  290. * @param {string} flags The flags of the regular expression.
  291. * @param {Function} unicodeFixer Fixer for missing "u" flag.
  292. * @returns {void}
  293. */
  294. function verify(node, pattern, flags, unicodeFixer) {
  295. let patternNode;
  296. try {
  297. patternNode = parser.parsePattern(pattern, 0, pattern.length, {
  298. unicode: flags.includes("u"),
  299. unicodeSets: flags.includes("v"),
  300. });
  301. } catch {
  302. // Ignore regular expressions with syntax errors
  303. return;
  304. }
  305. let codeUnits = null;
  306. /**
  307. * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
  308. * For the purposes of this rule, an escape sequence is considered acceptable if it consists of one or more backslashes followed by the character being escaped.
  309. * @param {Character} char Character to check.
  310. * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
  311. */
  312. function isAcceptableEscapeSequence(char) {
  313. if (node.type === "Literal" && node.regex) {
  314. return checkForAcceptableEscape(char, char.raw);
  315. }
  316. if (node.type === "Literal" && typeof node.value === "string") {
  317. const nodeSource = node.raw;
  318. codeUnits ??= parseStringLiteral(nodeSource);
  319. return checkForAcceptableEscapeInString(
  320. char,
  321. nodeSource,
  322. codeUnits,
  323. );
  324. }
  325. if (astUtils.isStaticTemplateLiteral(node)) {
  326. const nodeSource = sourceCode.getText(node);
  327. codeUnits ??= parseTemplateToken(nodeSource);
  328. return checkForAcceptableEscapeInString(
  329. char,
  330. nodeSource,
  331. codeUnits,
  332. );
  333. }
  334. return false;
  335. }
  336. const foundKindMatches = new Map();
  337. visitRegExpAST(patternNode, {
  338. onCharacterClassEnter(ccNode) {
  339. for (const unfilteredChars of iterateCharacterSequence(
  340. ccNode.elements,
  341. )) {
  342. let chars;
  343. if (allowEscape) {
  344. // Replace escape sequences with null to avoid having them flagged.
  345. chars = unfilteredChars.map(char =>
  346. isAcceptableEscapeSequence(char) ? null : char,
  347. );
  348. } else {
  349. chars = unfilteredChars;
  350. }
  351. for (const kind of kinds) {
  352. const matches = findCharacterSequences[kind](
  353. chars,
  354. unfilteredChars,
  355. );
  356. if (foundKindMatches.has(kind)) {
  357. foundKindMatches.get(kind).push(...matches);
  358. } else {
  359. foundKindMatches.set(kind, [...matches]);
  360. }
  361. }
  362. }
  363. },
  364. });
  365. /**
  366. * Finds the report loc(s) for a range of matches.
  367. * Only literals and expression-less templates generate granular errors.
  368. * @param {Character[][]} matches Lists of individual characters being reported on.
  369. * @returns {Location[]} locs for context.report.
  370. * @see https://github.com/eslint/eslint/pull/17515
  371. */
  372. function getNodeReportLocations(matches) {
  373. if (
  374. !astUtils.isStaticTemplateLiteral(node) &&
  375. node.type !== "Literal"
  376. ) {
  377. return matches.length ? [node.loc] : [];
  378. }
  379. return matches.map(chars => {
  380. const firstIndex = chars[0].start;
  381. const lastIndex = chars.at(-1).end - 1;
  382. let start;
  383. let end;
  384. if (node.type === "TemplateLiteral") {
  385. const source = sourceCode.getText(node);
  386. const offset = node.range[0];
  387. codeUnits ??= parseTemplateToken(source);
  388. start = offset + codeUnits[firstIndex].start;
  389. end = offset + codeUnits[lastIndex].end;
  390. } else if (typeof node.value === "string") {
  391. // String Literal
  392. const source = node.raw;
  393. const offset = node.range[0];
  394. codeUnits ??= parseStringLiteral(source);
  395. start = offset + codeUnits[firstIndex].start;
  396. end = offset + codeUnits[lastIndex].end;
  397. } else {
  398. // RegExp Literal
  399. const offset = node.range[0] + 1; // Add 1 to skip the leading slash.
  400. start = offset + firstIndex;
  401. end = offset + lastIndex + 1;
  402. }
  403. return {
  404. start: sourceCode.getLocFromIndex(start),
  405. end: sourceCode.getLocFromIndex(end),
  406. };
  407. });
  408. }
  409. for (const [kind, matches] of foundKindMatches) {
  410. let suggest;
  411. if (kind === "surrogatePairWithoutUFlag") {
  412. suggest = [
  413. {
  414. messageId: "suggestUnicodeFlag",
  415. fix: unicodeFixer,
  416. },
  417. ];
  418. }
  419. const locs = getNodeReportLocations(matches);
  420. for (const loc of locs) {
  421. context.report({
  422. node,
  423. loc,
  424. messageId: kind,
  425. suggest,
  426. });
  427. }
  428. }
  429. }
  430. return {
  431. "Literal[regex]"(node) {
  432. if (checkedPatternNodes.has(node)) {
  433. return;
  434. }
  435. verify(node, node.regex.pattern, node.regex.flags, fixer => {
  436. if (
  437. !isValidWithUnicodeFlag(
  438. context.languageOptions.ecmaVersion,
  439. node.regex.pattern,
  440. )
  441. ) {
  442. return null;
  443. }
  444. return fixer.insertTextAfter(node, "u");
  445. });
  446. },
  447. Program(node) {
  448. const scope = sourceCode.getScope(node);
  449. const tracker = new ReferenceTracker(scope);
  450. /*
  451. * Iterate calls of RegExp.
  452. * E.g., `new RegExp()`, `RegExp()`, `new window.RegExp()`,
  453. * `const {RegExp: a} = window; new a()`, etc...
  454. */
  455. for (const { node: refNode } of tracker.iterateGlobalReferences(
  456. {
  457. RegExp: { [CALL]: true, [CONSTRUCT]: true },
  458. },
  459. )) {
  460. let pattern, flags;
  461. const [patternNode, flagsNode] = refNode.arguments;
  462. const evaluatedPattern = getStaticValueOrRegex(
  463. patternNode,
  464. scope,
  465. );
  466. if (!evaluatedPattern) {
  467. continue;
  468. }
  469. if (flagsNode) {
  470. if (evaluatedPattern.regex) {
  471. pattern = evaluatedPattern.regex.pattern;
  472. checkedPatternNodes.add(patternNode);
  473. } else {
  474. pattern = String(evaluatedPattern.value);
  475. }
  476. flags = getStringIfConstant(flagsNode, scope);
  477. } else {
  478. if (evaluatedPattern.regex) {
  479. continue;
  480. }
  481. pattern = String(evaluatedPattern.value);
  482. flags = "";
  483. }
  484. if (typeof flags === "string") {
  485. verify(patternNode, pattern, flags, fixer => {
  486. if (
  487. !isValidWithUnicodeFlag(
  488. context.languageOptions.ecmaVersion,
  489. pattern,
  490. )
  491. ) {
  492. return null;
  493. }
  494. if (refNode.arguments.length === 1) {
  495. const penultimateToken =
  496. sourceCode.getLastToken(refNode, {
  497. skip: 1,
  498. }); // skip closing parenthesis
  499. return fixer.insertTextAfter(
  500. penultimateToken,
  501. astUtils.isCommaToken(penultimateToken)
  502. ? ' "u",'
  503. : ', "u"',
  504. );
  505. }
  506. if (
  507. (flagsNode.type === "Literal" &&
  508. typeof flagsNode.value === "string") ||
  509. flagsNode.type === "TemplateLiteral"
  510. ) {
  511. const range = [
  512. flagsNode.range[0],
  513. flagsNode.range[1] - 1,
  514. ];
  515. return fixer.insertTextAfterRange(range, "u");
  516. }
  517. return null;
  518. });
  519. }
  520. }
  521. },
  522. };
  523. },
  524. };