import { getDb } from "./db";
import type { Keyword, RecentTransaction } from "../shared/types";

/**
 * Normalize a description for keyword matching:
 * - strip accents via NFD decomposition (combining marks U+0300–U+036F removed)
 * - lowercase
 * - collapse whitespace runs to a single space and trim the ends
 */
export function normalizeDescription(desc: string): string {
  return desc
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    .replace(/\s+/g, " ")
    .trim();
}

const WORD_CHAR = /\w/;

/** Priority given to keywords created through `applyKeywordWithReassignment`. */
const DEFAULT_KEYWORD_PRIORITY = 100;

/**
 * Build a regex pattern for a keyword with smart boundaries.
 * Uses \b when the keyword edge is a word character (a-z, 0-9, _),
 * and uses (?<=\s|^) / (?=\s|$) when the edge is a non-word character
 * (e.g., brackets, parentheses, dashes). This ensures keywords like
 * "[VIREMENT]" or "(INTERAC)" can match correctly.
 *
 * An empty keyword yields a pattern that never matches. Without this guard
 * the pattern would degenerate to `\b\b`, which matches any text containing
 * a word character.
 *
 * @param normalizedKeyword keyword already passed through `normalizeDescription`
 * @returns a non-global (stateless) regex for the keyword
 */
export function buildKeywordRegex(normalizedKeyword: string): RegExp {
  if (normalizedKeyword.length === 0) {
    // Empty negative lookahead: fails at every position.
    return new RegExp("(?!)");
  }

  const escaped = normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const firstChar = normalizedKeyword.charAt(0);
  const lastChar = normalizedKeyword.charAt(normalizedKeyword.length - 1);
  const left = WORD_CHAR.test(firstChar) ? "\\b" : "(?<=\\s|^)";
  const right = WORD_CHAR.test(lastChar) ? "\\b" : "(?=\\s|$)";
  return new RegExp(`${left}${escaped}${right}`);
}

interface CategorizationResult {
  category_id: number | null;
  supplier_id: number | null;
}

interface CompiledKeyword {
  regex: RegExp;
  category_id: number;
  supplier_id: number | null;
}

/**
 * Compile keywords into regex patterns once for reuse across multiple matches.
 * The input order is preserved, so the first entry wins when matching.
 */
export function compileKeywords(keywords: Keyword[]): CompiledKeyword[] {
  return keywords.map((kw) => ({
    regex: buildKeywordRegex(normalizeDescription(kw.keyword)),
    category_id: kw.category_id,
    supplier_id: kw.supplier_id ?? null,
  }));
}

/**
 * Match a normalized description against compiled keywords.
 * Returns the ids of the first keyword whose regex matches, or nulls.
 */
function matchDescription(
  normalized: string,
  compiled: CompiledKeyword[]
): CategorizationResult {
  for (const kw of compiled) {
    if (kw.regex.test(normalized)) {
      return {
        category_id: kw.category_id,
        supplier_id: kw.supplier_id,
      };
    }
  }
  return { category_id: null, supplier_id: null };
}

/**
 * Load the active keywords (highest priority first) and compile them.
 * Shared by the single and batch categorization entry points.
 */
async function loadCompiledKeywords(): Promise<CompiledKeyword[]> {
  const db = await getDb();
  const keywords: Keyword[] = await db.select(
    "SELECT * FROM keywords WHERE is_active = 1 ORDER BY priority DESC"
  );
  return compileKeywords(keywords);
}

/**
 * Auto-categorize a single transaction description.
 * Returns matching category_id and supplier_id, or nulls if no match.
 */
export async function categorizeDescription(
  description: string
): Promise<CategorizationResult> {
  const compiled = await loadCompiledKeywords();
  return matchDescription(normalizeDescription(description), compiled);
}

/**
 * Auto-categorize a batch of transactions (by their descriptions).
 * Keywords are loaded and compiled once for the whole batch.
 * Returns an array of results in the same order.
 */
export async function categorizeBatch(
  descriptions: string[]
): Promise<CategorizationResult[]> {
  const compiled = await loadCompiledKeywords();
  return descriptions.map((desc) =>
    matchDescription(normalizeDescription(desc), compiled)
  );
}

// --- AddKeywordDialog support (Issue #74) ---

export const KEYWORD_MIN_LENGTH = 2;
export const KEYWORD_MAX_LENGTH = 64;
export const KEYWORD_PREVIEW_LIMIT = 50;

/**
 * Validate a keyword before it hits the regex engine.
 *
 * The input is trimmed first. Anything shorter than `KEYWORD_MIN_LENGTH`
 * (which includes whitespace-only input) is rejected as "tooShort", and
 * anything longer than `KEYWORD_MAX_LENGTH` as "tooLong". The length cap
 * is meant to prevent ReDoS (CWE-1333) when the compiled regex is replayed
 * across many transactions later.
 *
 * @returns `{ ok: true, value }` with the trimmed keyword, or
 *          `{ ok: false, reason }` describing the rejection
 */
export function validateKeyword(
  raw: string
): { ok: true; value: string } | { ok: false; reason: "tooShort" | "tooLong" } {
  const trimmed = raw.trim();
  if (trimmed.length < KEYWORD_MIN_LENGTH) {
    return { ok: false, reason: "tooShort" };
  }
  if (trimmed.length > KEYWORD_MAX_LENGTH) {
    return { ok: false, reason: "tooLong" };
  }
  return { ok: true, value: trimmed };
}

/**
 * Preview the transactions that would be recategorised if the user commits
 * the given keyword. Uses a parameterised `LIKE $1` to scope the candidates,
 * then re-filters in memory with `buildKeywordRegex` for exact word-boundary
 * matching. Results are capped at `limit` visible rows — callers decide what
 * to do with the `totalMatches` (which may be greater than the returned list).
 *
 * An invalid keyword, or one that normalises to an empty string, yields an
 * empty preview without querying the database.
 *
 * SECURITY: the keyword is never interpolated into the SQL string. `LIKE $1`
 * is the only parameterised binding, and the `%...%` wrapping happens inside
 * the bound parameter value.
 *
 * @param keyword raw keyword as typed by the user
 * @param limit maximum number of rows in `visible`; negative values are
 *              treated as 0
 */
export async function previewKeywordMatches(
  keyword: string,
  limit: number = KEYWORD_PREVIEW_LIMIT,
): Promise<{ visible: RecentTransaction[]; totalMatches: number }> {
  const validation = validateKeyword(keyword);
  if (!validation.ok) {
    return { visible: [], totalMatches: 0 };
  }

  const normalized = normalizeDescription(validation.value);
  if (normalized.length === 0) {
    // e.g. a keyword made only of combining marks: nothing can match.
    return { visible: [], totalMatches: 0 };
  }
  const regex = buildKeywordRegex(normalized);
  const db = await getDb();

  // Coarse pre-filter via parameterised LIKE against the lowercased
  // description. The candidate set is hard-capped to 1000 rows before the
  // in-memory regex filter, so `totalMatches` counts matches within that
  // window only.
  // NOTE(review): the SQL side does not strip accents or collapse whitespace
  // the way normalizeDescription does, so descriptions that only match after
  // normalisation may be missed by this pre-filter — confirm whether that is
  // acceptable for the preview.
  const likePattern = `%${normalized}%`;
  const candidates: RecentTransaction[] = await db.select(
    `SELECT t.id, t.date, t.description, t.amount,
            c.name AS category_name, c.color AS category_color
     FROM transactions t
     LEFT JOIN categories c ON t.category_id = c.id
     WHERE LOWER(t.description) LIKE $1
     ORDER BY t.date DESC
     LIMIT 1000`,
    [likePattern],
  );

  const matched = candidates.filter((tx) =>
    regex.test(normalizeDescription(tx.description))
  );

  return {
    // Clamp so a negative limit cannot turn into "drop rows from the end".
    visible: matched.slice(0, Math.max(0, limit)),
    totalMatches: matched.length,
  };
}

export interface ApplyKeywordInput {
  keyword: string;
  categoryId: number;
  /** ids of transactions to recategorise (only those the user checked). */
  transactionIds: number[];
  /**
   * When true, and a keyword with the same spelling already exists for a
   * different category, that existing keyword is **reassigned** to the new
   * category rather than creating a duplicate. Matches the spec decision
   * that history is never touched — only the visible transactions are
   * recategorised.
   */
  allowReplaceExisting: boolean;
}

export interface ApplyKeywordResult {
  keywordId: number;
  updatedTransactions: number;
  replacedExisting: boolean;
}

/**
 * INSERTs (or reassigns) a keyword and recategorises the given transaction
 * ids between a `BEGIN` and a `COMMIT`; on any failure a `ROLLBACK` is
 * issued and the original error is rethrown.
 *
 * NOTE(review): BEGIN/COMMIT/ROLLBACK are sent as separate statements, so
 * atomicity assumes `getDb()` runs them all on the same connection — confirm.
 *
 * SECURITY: every query is parameterised. The caller is expected to have
 * vetted `transactionIds` from a preview window that the user confirmed.
 *
 * @returns the keyword id, the number of transaction UPDATEs issued (one per
 *          entry of `transactionIds`), and whether an existing keyword was
 *          moved to a different category
 * @throws Error `invalid_keyword:<reason>` when the keyword fails validation
 * @throws Error `keyword_already_exists` when the spelling exists under
 *         another category and `allowReplaceExisting` is false
 */
export async function applyKeywordWithReassignment(
  input: ApplyKeywordInput,
): Promise<ApplyKeywordResult> {
  const validation = validateKeyword(input.keyword);
  if (!validation.ok) {
    throw new Error(`invalid_keyword:${validation.reason}`);
  }
  const keyword = validation.value;

  const db = await getDb();
  await db.execute("BEGIN");
  try {
    // Is there already a row for this keyword spelling?
    const existing: Array<{ id: number; category_id: number }> = await db.select(
      `SELECT id, category_id FROM keywords WHERE keyword = $1 LIMIT 1`,
      [keyword],
    );
    const current = existing[0];

    let keywordId: number;
    let replacedExisting = false;

    if (current !== undefined) {
      const categoryChanges = current.category_id !== input.categoryId;
      if (categoryChanges && !input.allowReplaceExisting) {
        throw new Error("keyword_already_exists");
      }
      // Also reactivates the keyword if it had been disabled.
      await db.execute(
        `UPDATE keywords SET category_id = $1, is_active = 1 WHERE id = $2`,
        [input.categoryId, current.id],
      );
      keywordId = current.id;
      replacedExisting = categoryChanges;
    } else {
      const result = await db.execute(
        `INSERT INTO keywords (keyword, category_id, priority) VALUES ($1, $2, $3)`,
        [keyword, input.categoryId, DEFAULT_KEYWORD_PRIORITY],
      );
      keywordId = Number(result.lastInsertId ?? 0);
    }

    // Statements run one at a time on purpose: they belong to the open
    // transaction and must not be interleaved.
    let updatedTransactions = 0;
    for (const txId of input.transactionIds) {
      await db.execute(
        `UPDATE transactions
         SET category_id = $1,
             is_manually_categorized = 1,
             updated_at = CURRENT_TIMESTAMP
         WHERE id = $2`,
        [input.categoryId, txId],
      );
      updatedTransactions++;
    }

    await db.execute("COMMIT");
    return { keywordId, updatedTransactions, replacedExisting };
  } catch (e) {
    try {
      await db.execute("ROLLBACK");
    } catch {
      // A failed rollback must not mask the error that triggered it; the
      // original failure is the one callers need to see.
    }
    throw e;
  }
}