fix: keyword matching with non-word boundary characters
Keywords starting/ending with non-word characters (brackets, parens, dashes) never matched because \b requires a word↔non-word transition. Now uses smart boundaries: \b for word-char edges, (?<=\s|^)/(?=\s|$) for non-word edges. Also pre-compiles regex patterns once per batch instead of recreating them for every description × keyword combination. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8388b08a84
commit
4938bba3f3
1 changed file with 60 additions and 24 deletions
|
|
@ -16,11 +16,66 @@ function normalizeDescription(desc: string): string {
|
||||||
.trim();
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Matches a single ASCII word character (letter, digit, or underscore). */
const WORD_CHAR = /\w/;

/**
 * Build a regex for a keyword, choosing a boundary assertion for each edge.
 *
 * - If the edge character is a word character (`\w`), `\b` is used.
 * - Otherwise (brackets, parentheses, dashes, ...) `\b` cannot be used,
 *   because it needs a word/non-word transition at that position. A
 *   lookaround for whitespace or the string edge is used instead, so
 *   keywords like "[VIREMENT]" or "(INTERAC)" can match.
 *
 * An empty keyword yields a regex that never matches. Without this guard,
 * indexing the empty string gives `undefined`, which `RegExp.prototype.test`
 * coerces to the text "undefined"; both edges would then be treated as word
 * characters and the resulting `\b\b` pattern would match any description
 * containing a word character.
 *
 * @param normalizedKeyword - Keyword text, already normalized by the caller.
 * @returns A regex without flags (so `test` is stateless) that matches the
 *   keyword literally between the chosen boundaries.
 */
function buildKeywordRegex(normalizedKeyword: string): RegExp {
  if (normalizedKeyword.length === 0) {
    // Empty negative lookahead: fails at every position.
    return /(?!)/;
  }

  // Escape every regex metacharacter so the keyword is matched literally.
  const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  const firstChar = normalizedKeyword.charAt(0);
  const lastChar = normalizedKeyword.charAt(normalizedKeyword.length - 1);

  const left = WORD_CHAR.test(firstChar) ? "\\b" : "(?<=\\s|^)";
  const right = WORD_CHAR.test(lastChar) ? "\\b" : "(?=\\s|$)";

  return new RegExp(`${left}${escaped}${right}`);
}
|
||||||
|
|
||||||
/** Outcome of matching one description against the keyword list. */
interface CategorizationResult {
  /** Category of the matching keyword, or null when nothing matched. */
  category_id: number | null;
  /** Supplier of the matching keyword, or null when absent or nothing matched. */
  supplier_id: number | null;
}
|
||||||
|
|
||||||
|
/** A keyword whose match pattern has been compiled ahead of time. */
interface CompiledKeyword {
  /** Pattern tested against a normalized description. */
  regex: RegExp;
  /** Category to assign when the pattern matches. */
  category_id: number;
  /** Supplier to assign when the pattern matches; null when the keyword has none. */
  supplier_id: number | null;
}
|
||||||
|
|
||||||
|
/**
 * Turn keyword rows into ready-to-test regex entries, preserving input order,
 * so each pattern is built once rather than once per description.
 *
 * @param keywords - Keyword rows to compile.
 * @returns One compiled entry per keyword, in the same order; a missing
 *   supplier_id is stored as null.
 */
function compileKeywords(keywords: Keyword[]): CompiledKeyword[] {
  const compiled: CompiledKeyword[] = [];
  for (const entry of keywords) {
    const pattern = buildKeywordRegex(normalizeDescription(entry.keyword));
    compiled.push({
      regex: pattern,
      category_id: entry.category_id,
      supplier_id: entry.supplier_id ?? null,
    });
  }
  return compiled;
}
|
||||||
|
|
||||||
|
/**
 * Find the first compiled keyword whose pattern matches the description.
 *
 * Entries are tried in array order and the first hit wins, so the caller's
 * ordering decides which keyword takes precedence.
 *
 * @param normalized - Description text, already normalized by the caller.
 * @param compiled - Compiled keywords to test, in precedence order.
 * @returns The ids of the first matching keyword, or nulls for both fields
 *   when no pattern matches.
 */
function matchDescription(
  normalized: string,
  compiled: CompiledKeyword[]
): CategorizationResult {
  const hit = compiled.find((candidate) => candidate.regex.test(normalized));
  if (hit === undefined) {
    return { category_id: null, supplier_id: null };
  }
  const { category_id, supplier_id } = hit;
  return { category_id, supplier_id };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Auto-categorize a single transaction description.
|
* Auto-categorize a single transaction description.
|
||||||
* Returns matching category_id and supplier_id, or nulls if no match.
|
* Returns matching category_id and supplier_id, or nulls if no match.
|
||||||
|
|
@ -33,20 +88,9 @@ export async function categorizeDescription(
|
||||||
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
|
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const compiled = compileKeywords(keywords);
|
||||||
const normalized = normalizeDescription(description);
|
const normalized = normalizeDescription(description);
|
||||||
|
return matchDescription(normalized, compiled);
|
||||||
for (const kw of keywords) {
|
|
||||||
const normalizedKeyword = normalizeDescription(kw.keyword);
|
|
||||||
const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
||||||
if (new RegExp(`\\b${escaped}\\b`).test(normalized)) {
|
|
||||||
return {
|
|
||||||
category_id: kw.category_id,
|
|
||||||
supplier_id: kw.supplier_id ?? null,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return { category_id: null, supplier_id: null };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -61,18 +105,10 @@ export async function categorizeBatch(
|
||||||
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
|
"SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const compiled = compileKeywords(keywords);
|
||||||
|
|
||||||
return descriptions.map((desc) => {
|
return descriptions.map((desc) => {
|
||||||
const normalized = normalizeDescription(desc);
|
const normalized = normalizeDescription(desc);
|
||||||
for (const kw of keywords) {
|
return matchDescription(normalized, compiled);
|
||||||
const normalizedKeyword = normalizeDescription(kw.keyword);
|
|
||||||
const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
||||||
if (new RegExp(`\\b${escaped}\\b`).test(normalized)) {
|
|
||||||
return {
|
|
||||||
category_id: kw.category_id,
|
|
||||||
supplier_id: kw.supplier_id ?? null,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return { category_id: null, supplier_id: null };
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue