深入理解正则表达式

完整语法、命名捕获组、断言、回溯原理、ReDoS 防御与 ES2024+ 新特性

什么是正则表达式？

定义：正则表达式（Regular Expression, RegExp）是一种用于描述字符串匹配模式的微型语言。JavaScript 通过内置的 RegExp 对象和字符串方法（match、replace、search、split）提供正则支持，可以高效地进行字符串搜索、提取、验证和替换。

涉及场景：

表单验证：邮箱、手机号、身份证号、密码强度等格式校验
文本搜索与替换：编辑器的查找替换、代码中的字符串处理
数据提取：从 HTML/日志/URL 中提取关键信息（命名捕获组）
输入过滤：过滤特殊字符、XSS 防御中的标签清理（仅可作为辅助手段，不能替代专门的 HTML 净化库或输出转义）
路由匹配：前端路由框架中路径参数的解析（如 /user/:id）
代码转换：Babel 插件、ESLint 规则中的 AST 之外的文本处理

作用：

高效模式匹配：一行正则替代数十行手写循环判断
通用技能：正则语法在几乎所有编程语言和工具中通用
性能敏感：理解回溯原理和 ReDoS 攻击，避免灾难性回溯
ES 新特性加持：命名捕获组、matchAll、/v 模式等持续增强能力

正则基础语法速查

字符类

javascript

.       // 匹配任意字符（除换行符，dotAll模式下匹配所有）
\d      // 数字 [0-9]
\D      // 非数字 [^0-9]
\w      // 单词字符 [a-zA-Z0-9_]
\W      // 非单词字符
\s      // 空白字符（空格、Tab、换行等）
\S      // 非空白字符
\b      // 单词边界
\B      // 非单词边界

// 字符集合
[abc]   // a 或 b 或 c
[^abc]  // 非 a/b/c
[a-z]   // a 到 z
[a-zA-Z0-9]  // 字母和数字

量词

javascript

*       // 0次或多次（贪婪）
+       // 1次或多次（贪婪）
?       // 0次或1次
{n}     // 恰好n次
{n,}    // 至少n次
{n,m}   // n到m次

// 非贪婪（惰性）：加 ?
*?      // 0次或多次（非贪婪）
+?      // 1次或多次（非贪婪）
??      // 0次或1次（非贪婪）
{n,m}?  // n到m次（非贪婪）

// 示例
'aabbb'.match(/a+/);   // 'aa'（贪婪，尽可能多）
'aabbb'.match(/a+?/);  // 'a'（非贪婪，尽可能少）

// 贪婪 vs 非贪婪
'<div>hello</div><div>world</div>'.match(/<div>.*<\/div>/);
// '<div>hello</div><div>world</div>'（贪婪，匹配到最后一个</div>）

'<div>hello</div><div>world</div>'.match(/<div>.*?<\/div>/);
// '<div>hello</div>'（非贪婪，匹配到第一个</div>）

分组与引用

javascript

// 捕获组
/(foo)(bar)/.exec('foobar');
// ['foobar', 'foo', 'bar']
// $1 = 'foo', $2 = 'bar'

// 非捕获组（不记录，性能更好）
/(?:foo)(bar)/.exec('foobar');
// ['foobar', 'bar']（没有 foo 的捕获）

// 命名捕获组（ES2018）
/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/.exec('2026-03-25');
// groups: { year: '2026', month: '03', day: '25' }

// 反向引用
/(["']).*?\1/.test('"hello"');  // true（\1 引用第一个捕获组的内容）
/(["']).*?\1/.test('"hello\''); // false（引号不匹配）

// 命名反向引用
/(?<quote>["']).*?\k<quote>/.test('"hello"'); // true

标志（Flags）

javascript

/pattern/g   // global：全局匹配（不在第一个匹配后停止）
/pattern/i   // ignoreCase：不区分大小写
/pattern/m   // multiline：^ 和 $ 匹配每一行的开头和结尾
/pattern/s   // dotAll：. 匹配包括换行符在内的所有字符（ES2018）
/pattern/u   // unicode：启用完整的 Unicode 支持
/pattern/y   // sticky：只在 lastIndex 位置尝试匹配，失败时不会继续向后搜索
/pattern/d   // hasIndices：返回匹配的索引信息（ES2022）
/pattern/v   // unicodeSets：Unicode 集合模式（ES2024）

断言（Assertions）

前瞻断言（Lookahead）

javascript

// 正向前瞻 (?=...)：后面必须跟着...
/\d+(?=px)/.exec('100px 200em');
// '100'（匹配后面跟着 px 的数字）

// 负向前瞻 (?!...)：后面不能跟着...
/\d+(?!px)/.exec('100px 200em');
// '10'（匹配后面不跟 px 的数字）← 注意是 '10' 不是 '200'

// More precise: also forbid a following digit, so the engine cannot
// backtrack into a partial number such as '10' or '1'.
/\d+(?!\d|px)/.exec('100px 200em');
// '200' — note that \b does not help here: both '0' and 'e' are word
// characters, so there is no word boundary between "200" and "em".

后瞻断言（Lookbehind，ES2018）

javascript

// 正向后瞻 (?<=...)：前面必须是...
/(?<=\$)\d+/.exec('$100 ¥200');
// '100'（匹配前面是 $ 的数字）

// 负向后瞻 (?<!...)：前面不能是...
/(?<!\$)\d+/.exec('$100 ¥200');
// '00'（匹配前面不是 $ 的数字）

// 实际应用：密码强度校验
const strongPassword = /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[!@#$%^&*]).{8,}$/;
// (?=.*[a-z])  至少一个小写字母
// (?=.*[A-Z])  至少一个大写字母
// (?=.*\d)     至少一个数字
// (?=.*[!@#$%^&*]) 至少一个特殊字符
// .{8,}        总长度至少8

Unicode 支持

u 标志

javascript

// 不加 u，\u{} 语法不工作
/\u{61}/.test('a');   // false（被当作 u 重复61次）
/\u{61}/u.test('a');  // true（Unicode码点）

// emoji 等多字节字符
'😀'.length;              // 2（JS 使用 UTF-16）
/^.$/u.test('😀');         // true（u 标志正确处理）
/^.$/.test('😀');          // false（不加 u，. 只匹配一个码元）

// Unicode 属性转义（\p{} / \P{}）
/\p{Script=Han}/u.test('中');     // true（汉字）
/\p{Script=Latin}/u.test('a');    // true（拉丁字母）
/\p{Emoji}/u.test('😀');           // true
/\p{Number}/u.test('①');          // true
/\P{Number}/u.test('a');          // true（非数字）

v 标志（ES2024，Unicode Sets）

javascript

// v 标志是 u 的升级版
// 支持集合操作：交集 &&、差集 --、嵌套字符类

// 交集：同时满足两个条件
/[\p{Script=Greek}&&\p{Letter}]/v.test('α'); // true

// 差集：属于A但不属于B
/[\p{Decimal_Number}--[0-9]]/v.test('٣');    // true（阿拉伯-印度数字 3，U+0663，不属于 ASCII 的 0-9）

// 嵌套字符类
/[[a-z]--[aeiou]]/v;  // 辅音字母（小写字母减去元音）

// 字符串属性
/\p{Basic_Emoji}/v.test('😀');   // true
/^\p{RGI_Emoji}$/v.test('👨‍👩‍👧‍👦'); // true（家庭emoji）

常用正则模式

javascript

// 邮箱（简化版）
const email = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;

// 手机号（中国大陆）
const phone = /^1[3-9]\d{9}$/;

// URL
const url = /^https?:\/\/([\w-]+\.)+[\w-]+(\/[\w\-./?%&=]*)?$/;

// IPv4
const ipv4 = /^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$/;

// 中文
const chinese = /[\u4e00-\u9fa5]/;  // 传统写法
const chineseV = /\p{Script=Han}/u;  // Unicode 属性（推荐）

// 千分位格式化
/**
 * Formats a number with comma thousands separators.
 *
 * Only the integer part (the text before the first '.') is grouped; any
 * fractional or exponent text after it is kept unchanged, so
 * 1234.5678 becomes '1,234.5678' rather than '1,234.5,678'.
 *
 * @param {number|string|bigint} num - Value whose `toString()` output is formatted.
 * @returns {string} The formatted string.
 */
function formatNumber(num) {
  const [integerPart, ...rest] = num.toString().split('.');
  // \B avoids a comma at the very start of the digit run; the lookahead
  // requires the remaining digits to form complete groups of three.
  const grouped = integerPart.replace(/\B(?=(\d{3})+(?!\d))/g, ',');
  return [grouped, ...rest].join('.');
}
formatNumber(1234567890); // '1,234,567,890'

// 驼峰转换
/**
 * Converts camelCase to kebab-case by replacing every ASCII uppercase
 * letter with a dash followed by its lowercase form.
 *
 * @param {string} str - Input in camelCase.
 * @returns {string} The dashed, lowercased result.
 */
function camelToKebab(str) {
  const dashLower = (upper) => `-${upper.toLowerCase()}`;
  return str.replace(/[A-Z]/g, dashLower);
}
/**
 * Converts kebab-case to camelCase: each dash that is immediately followed
 * by an ASCII lowercase letter is removed and that letter is uppercased.
 *
 * @param {string} str - Input in kebab-case.
 * @returns {string} The camelCased result.
 */
function kebabToCamel(str) {
  const upperAfterDash = (_whole, letter) => letter.toUpperCase();
  return str.replace(/-([a-z])/g, upperAfterDash);
}

// HTML 标签提取
const htmlTag = /<(?<tag>\w+)(?:\s+[^>]*)?>(?<content>[\s\S]*?)<\/\k<tag>>/g;

// 模板字符串解析
// Matches {{key}} placeholders; the single capture group is the key name.
const template = /\{\{(\w+)\}\}/g;

/**
 * Replaces every {{key}} placeholder in `tpl` with the matching value
 * from `data`.
 *
 * Only own properties of `data` are used, so placeholders such as
 * {{toString}} or {{constructor}} do not render inherited members.
 * Missing keys and null/undefined values render as an empty string.
 *
 * @param {string} tpl - Template text containing {{key}} placeholders.
 * @param {object} data - Values keyed by placeholder name.
 * @returns {string} The rendered text.
 */
function render(tpl, data) {
  return tpl.replace(template, (_, key) =>
    (Object.hasOwn(data, key) ? (data[key] ?? '') : ''),
  );
}
render('Hello {{name}}, age {{age}}', { name: '张三', age: 25 });
// 'Hello 张三, age 25'

String 和 RegExp 的方法

javascript

const str = 'hello world hello';
const re = /hello/g;

// String 方法
str.match(re);              // ['hello', 'hello']（g标志：所有匹配）
str.match(/(\w+) (\w+)/);  // ['hello world', 'hello', 'world']（无g：含捕获组）

str.matchAll(/hello/g);     // 迭代器，每个元素含 index、groups（ES2020）
[...str.matchAll(/(\w+)/g)]; // 所有匹配的详细信息

str.search(/world/);        // 6（第一个匹配的索引，未找到返回 -1）
str.replace(/hello/g, 'hi'); // 'hi world hi'
str.replaceAll('hello', 'hi'); // 'hi world hi'（ES2021）
str.split(/\s+/);           // ['hello', 'world', 'hello']

// replace 的特殊替换模式
'2026-03-25'.replace(/(\d{4})-(\d{2})-(\d{2})/, '$2/$3/$1');
// '03/25/2026'

// replace 回调函数
'hello'.replace(/./g, (match, offset) => {
  return offset === 0 ? match.toUpperCase() : match;
});
// 'Hello'

// RegExp 方法
re.test(str);               // true
re.exec(str);               // 逐次返回匹配（配合 g/y 标志）

// exec 循环
const re2 = /\d+/g;
const text = 'a1 b22 c333';
let match;
while ((match = re2.exec(text)) !== null) {
  console.log(`${match[0]} at index ${match.index}`);
}
// '1' at index 1
// '22' at index 4
// '333' at index 8

回溯原理与 ReDoS

NFA 引擎的回溯

javascript

// JavaScript 使用 NFA（非确定性有限自动机）引擎
// 遇到多种可能时会逐一尝试，失败则回溯

// 示例：/a*b/ 匹配 "aaac"
// 尝试1: a* 匹配 "aaa"，b 匹配 "c" → 失败
// 回溯: a* 匹配 "aa"，b 匹配 "a" → 失败
// 回溯: a* 匹配 "a"，b 匹配 "a" → 失败
// 回溯: a* 匹配 ""，b 匹配 "a" → 失败
// 结果: 起始位置 0 匹配失败（1 次尝试 + 3 次回溯）；引擎随后还会从后续每个起始位置重新尝试，最终整体不匹配

ReDoS（正则拒绝服务攻击）

javascript

// ❌ 灾难性回溯（Catastrophic Backtracking）
const evilRegex = /^(a+)+$/;
// 对于输入 "aaaaaaaaaaaaaaaaX"
// a+ 的每种分组方式都要尝试：
// (aaaa...)(a) / (aaa...)(aa) / (aaa...)(a)(a) / ...
// 时间复杂度 O(2^n)

console.time('evil');
evilRegex.test('a'.repeat(25) + 'X'); // 数秒甚至更久！
console.timeEnd('evil');

// 其他危险模式
/(a|a)+$/           // 重复的交替
/(a+)*$/            // 嵌套量词
/(\w+\s*)+$/        // 量词嵌套
/(.*a){20}/         // 过多的重复

// ✅ 安全改写
/^a+$/              // 移除嵌套
/^[a]+$/            // 使用字符类替代交替

// 防御策略
// 1. 避免嵌套量词
// 2. 使用原子组（JS 尚无原生语法，可用“前瞻 + 反向引用”模拟，如 (?=(a+))\1）
// 3. 限制输入长度
// 4. 使用 re2 库（线性时间引擎）
// 5. 设置超时
/**
 * Runs `pattern.test(str)` inside a Web Worker and gives up after `timeout`
 * milliseconds, so a catastrophically backtracking pattern cannot block
 * the calling thread.
 *
 * The pattern is sent to the worker as data (`source` + `flags`) and rebuilt
 * there, instead of being interpolated into the worker's source code. The
 * worker therefore always starts from `lastIndex` 0.
 *
 * @param {RegExp} pattern - Regular expression to evaluate.
 * @param {string} str - Input text to test.
 * @param {number} [timeout=1000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} Resolves with the result of the test. Rejects
 *   with `Error('Regex timeout')` on timeout, with a `TypeError` when
 *   `pattern` is not a RegExp, or with an `Error` when the worker fails.
 */
function safeRegexTest(pattern, str, timeout = 1000) {
  return new Promise((resolve, reject) => {
    if (!(pattern instanceof RegExp)) {
      reject(new TypeError('pattern must be a RegExp'));
      return;
    }

    const workerSource = `
      onmessage = ({ data }) => {
        postMessage(new RegExp(data.source, data.flags).test(data.str));
      };
    `;
    const workerUrl = URL.createObjectURL(
      new Blob([workerSource], { type: 'text/javascript' }),
    );

    let worker;
    try {
      worker = new Worker(workerUrl);
    } catch (err) {
      URL.revokeObjectURL(workerUrl);
      reject(err);
      return;
    }

    // Single exit path: always stop the timer, kill the worker and release
    // the Blob URL before settling the promise.
    const finish = (settle, value) => {
      clearTimeout(timer);
      worker.terminate();
      URL.revokeObjectURL(workerUrl);
      settle(value);
    };

    const timer = setTimeout(() => {
      finish(reject, new Error('Regex timeout'));
    }, timeout);

    worker.onmessage = (e) => finish(resolve, e.data);
    worker.onerror = (e) => {
      finish(reject, new Error(e?.message ?? 'Regex worker failed'));
    };

    worker.postMessage({ source: pattern.source, flags: pattern.flags, str });
  });
}

ES2022+ 正则新特性

hasIndices（d 标志，ES2022）

javascript

const re = /(?<year>\d{4})-(?<month>\d{2})/d;
const match = re.exec('Date: 2026-03');

// indices 属性包含每个捕获组的起止索引
console.log(match.indices);
// [[6, 13], [6, 10], [11, 13]]
// 完整匹配: [6, 13]  → '2026-03'
// year:     [6, 10]  → '2026'
// month:    [11, 13] → '03'

console.log(match.indices.groups);
// { year: [6, 10], month: [11, 13] }

总结

正则表达式核心知识点：
┌──────────────────────────────────────────────────────────┐
│ 基础语法                                                  │
│ • 字符类、量词（贪婪/非贪婪）、分组、反向引用                 │
│ • 标志：g/i/m/s/u/y/d/v                                  │
├──────────────────────────────────────────────────────────┤
│ 高级特性                                                  │
│ • 命名捕获组 (?<name>...)                                 │
│ • 前瞻/后瞻断言 (?=) (?!) (?<=) (?<!)                     │
│ • Unicode 属性 \p{Script=Han}                             │
│ • v 标志集合操作 && --（ES2024）                           │
│ • hasIndices d 标志（ES2022）                              │
├──────────────────────────────────────────────────────────┤
│ 性能与安全                                                │
│ • 回溯原理：NFA 引擎逐一尝试                                │
│ • ReDoS：嵌套量词导致灾难性回溯                             │
│ • 防御：避免嵌套量词、限制输入长度、使用 re2                  │
├──────────────────────────────────────────────────────────┤
│ 常用模式                                                  │
│ • 邮箱、手机号、URL、IP、中文、千分位格式化                   │
│ • String: match/matchAll/replace/replaceAll/search/split  │
│ • RegExp: test/exec                                      │
└──────────────────────────────────────────────────────────┘

深入理解正则表达式 ​

什么是正则表达式？ ​

正则基础语法速查 ​

字符类 ​

量词 ​

分组与引用 ​

标志（Flags） ​

断言（Assertions） ​

前瞻断言（Lookahead） ​

后瞻断言（Lookbehind，ES2018） ​

Unicode 支持 ​

u 标志 ​

v 标志（ES2024，Unicode Sets） ​

常用正则模式 ​

String 和 RegExp 的方法 ​

回溯原理与 ReDoS ​

NFA 引擎的回溯 ​

ReDoS（正则拒绝服务攻击） ​

ES2022+ 正则新特性 ​

hasIndices（d 标志，ES2022） ​

总结 ​

深入理解正则表达式

什么是正则表达式？

正则基础语法速查

字符类

量词

分组与引用

标志（Flags）

断言（Assertions）

前瞻断言（Lookahead）

后瞻断言（Lookbehind，ES2018）

Unicode 支持

u 标志

v 标志（ES2024，Unicode Sets）

常用正则模式

String 和 RegExp 的方法

回溯原理与 ReDoS

NFA 引擎的回溯

ReDoS（正则拒绝服务攻击）

ES2022+ 正则新特性

hasIndices（d 标志，ES2022）

总结