fix: keyword matching with non-word boundary characters

Keywords starting/ending with non-word characters (brackets, parens,
dashes) never matched because \b requires a word↔non-word transition.
Now uses smart boundaries: \b for word-char edges, (?<=\s|^)/(?=\s|$)
for non-word edges. Also pre-compiles regex patterns once per batch
instead of recreating them for every description × keyword combination.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
le king fu 2026-02-25 20:03:04 -05:00
parent 8388b08a84
commit 4938bba3f3

View file

@ -16,11 +16,66 @@ function normalizeDescription(desc: string): string {
.trim(); .trim();
} }
const WORD_CHAR = /\w/;
/**
* Build a regex pattern for a keyword with smart boundaries.
* Uses \b when the keyword edge is a word character (a-z, 0-9, _),
* and uses (?<=\s|^) / (?=\s|$) when the edge is a non-word character
* (e.g., brackets, parentheses, dashes). This ensures keywords like
* "[VIREMENT]" or "(INTERAC)" can match correctly.
*/
function buildKeywordRegex(normalizedKeyword: string): RegExp {
const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
const left = WORD_CHAR.test(normalizedKeyword[0])
? "\\b"
: "(?<=\\s|^)";
const right = WORD_CHAR.test(normalizedKeyword[normalizedKeyword.length - 1])
? "\\b"
: "(?=\\s|$)";
return new RegExp(`${left}${escaped}${right}`);
}
interface CategorizationResult { interface CategorizationResult {
category_id: number | null; category_id: number | null;
supplier_id: number | null; supplier_id: number | null;
} }
interface CompiledKeyword {
regex: RegExp;
category_id: number;
supplier_id: number | null;
}
/**
* Compile keywords into regex patterns once for reuse across multiple matches.
*/
function compileKeywords(keywords: Keyword[]): CompiledKeyword[] {
return keywords.map((kw) => ({
regex: buildKeywordRegex(normalizeDescription(kw.keyword)),
category_id: kw.category_id,
supplier_id: kw.supplier_id ?? null,
}));
}
/**
* Match a normalized description against compiled keywords.
*/
function matchDescription(
normalized: string,
compiled: CompiledKeyword[]
): CategorizationResult {
for (const kw of compiled) {
if (kw.regex.test(normalized)) {
return {
category_id: kw.category_id,
supplier_id: kw.supplier_id,
};
}
}
return { category_id: null, supplier_id: null };
}
/** /**
* Auto-categorize a single transaction description. * Auto-categorize a single transaction description.
* Returns matching category_id and supplier_id, or nulls if no match. * Returns matching category_id and supplier_id, or nulls if no match.
@ -33,20 +88,9 @@ export async function categorizeDescription(
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC" "SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
); );
const compiled = compileKeywords(keywords);
const normalized = normalizeDescription(description); const normalized = normalizeDescription(description);
return matchDescription(normalized, compiled);
for (const kw of keywords) {
const normalizedKeyword = normalizeDescription(kw.keyword);
const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
if (new RegExp(`\\b${escaped}\\b`).test(normalized)) {
return {
category_id: kw.category_id,
supplier_id: kw.supplier_id ?? null,
};
}
}
return { category_id: null, supplier_id: null };
} }
/** /**
@ -61,18 +105,10 @@ export async function categorizeBatch(
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC" "SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
); );
const compiled = compileKeywords(keywords);
return descriptions.map((desc) => { return descriptions.map((desc) => {
const normalized = normalizeDescription(desc); const normalized = normalizeDescription(desc);
for (const kw of keywords) { return matchDescription(normalized, compiled);
const normalizedKeyword = normalizeDescription(kw.keyword);
const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
if (new RegExp(`\\b${escaped}\\b`).test(normalized)) {
return {
category_id: kw.category_id,
supplier_id: kw.supplier_id ?? null,
};
}
}
return { category_id: null, supplier_id: null };
}); });
} }