function tokenizeZH(text) {
  // Word-granularity segmenter for Chinese; boundaries come from the
  // engine's built-in dictionary, not from whitespace.
  const segmenter = new Intl.Segmenter('zh', { granularity: 'word' });
  const segments = segmenter.segment(text);
  const words = [];
  for (const { segment /* , index, isWordLike */ } of segments) {
    words.push(segment);
  }
  return words;
}

console.log(tokenizeZH('我不是太清楚'));
Live: https://jsfiddle.net/rgqen1zc/
Output:
["我不是", "太", "清楚"]
The first segment, 我不是, should have been split into 我 and 不是.
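To see where the segmenter is drawing its boundaries, it can help to log each segment together with its index and isWordLike flag. This is only a debugging sketch built on the same Intl.Segmenter call; it inspects the grouping rather than changing it:

function inspectZH(text) {
  const segmenter = new Intl.Segmenter('zh', { granularity: 'word' });
  // Each item exposes the matched substring, its start offset in the input,
  // and whether the engine considers it word-like.
  for (const { segment, index, isWordLike } of segmenter.segment(text)) {
    console.log(index, isWordLike, segment);
  }
}

inspectZH('我不是太清楚');
// Given the output above, this would log 我不是 at index 0, 太 at 3, 清楚 at 4.

In V8 and other ICU-backed engines, Chinese word boundaries come from a built-in dictionary, so which characters get grouped into one segment is decided by the engine rather than by the options passed to the constructor.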