Simpl-Resultat/src/utils/csvAutoDetect.ts
Le-King-Fu 474c7b947a
Some checks failed
Release / build (windows-latest) (push) Has been cancelled
fix: CSV import bugs and dashboard category filtering
- Fix column display for Desjardins-style quoted CSVs (apply preprocessQuotedCSV in header loading)
- Fix column mapping disappearing on back-navigation (generate synthetic headers when hasHeader is false)
- Fix auto-detect picking account number as amount (exclude constant-value columns, treat 0 as empty in debit/credit detection)
- Use category type instead of amount sign for dashboard pie chart and recent transactions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 00:09:10 +00:00

467 lines
12 KiB
TypeScript

import Papa from "papaparse";
import { parseDate } from "./dateParser";
import { parseFrenchAmount } from "./amountParser";
import type { ColumnMapping, AmountMode, SignConvention } from "../shared/types";
/**
 * Suggested import configuration produced by `autoDetectConfig`.
 */
export interface AutoDetectResult {
// Field delimiter selected by `detectDelimiter` from the `DELIMITERS` candidates.
delimiter: string;
// True when the first parsed row contains no parseable date or amount.
hasHeader: boolean;
// Number of leading lines to skip; `autoDetectConfig` always reports 0.
skipLines: number;
// Entry of `DATE_FORMATS` that parsed the date column best.
dateFormat: string;
// Column indices for date and description, plus either `amount` or
// `debitAmount`/`creditAmount` depending on `amountMode`.
columnMapping: ColumnMapping;
// "single" or "debit_credit", as determined by `detectAmountMode`.
amountMode: AmountMode;
// Sign convention detected for single-amount files; left at
// "negative_expense" when the mode is "debit_credit".
signConvention: SignConvention;
}
// Date format patterns passed to `parseDate`, tried in this order by
// `detectHeader` and `detectDateColumn`. Order matters in `detectDateColumn`:
// on an equal success rate the earlier format is kept.
const DATE_FORMATS = [
"DD/MM/YYYY",
"MM/DD/YYYY",
"YYYY-MM-DD",
"YYYY/MM/DD",
"DD-MM-YYYY",
"DD.MM.YYYY",
"YYYYMMDD",
];
// Candidate field delimiters evaluated by `detectDelimiter` (comma, semicolon, tab).
const DELIMITERS = [",", ";", "\t"];
/**
 * Unwrap CSVs in which every line is itself enclosed in double quotes and
 * inner quotes are doubled (the Desjardins export style).
 *
 * A file qualifies only when every non-blank line, once trimmed, starts and
 * ends with `"` and contains the sequence `,""`. Otherwise the input is
 * returned untouched.
 *
 * @param content - Raw CSV text.
 * @returns The unwrapped text (trimmed lines joined with "\n", blank lines
 *   kept as empty strings), or `content` itself when the pattern does not
 *   apply or there are no non-blank lines.
 */
export function preprocessQuotedCSV(content: string): string {
  const trimmedLines = content.split(/\r?\n/).map((line) => line.trim());

  // Single pass: bail out on the first non-blank line that is not wrapped.
  let sawContent = false;
  for (const line of trimmedLines) {
    if (line === "") continue;
    sawContent = true;
    const wrapped =
      line.startsWith('"') && line.endsWith('"') && line.includes(',""');
    if (!wrapped) return content;
  }
  if (!sawContent) return content;

  const unwrapped: string[] = [];
  for (const line of trimmedLines) {
    // Drop the outer quote pair, then collapse doubled quotes.
    unwrapped.push(line === "" ? "" : line.slice(1, -1).replace(/""/g, '"'));
  }
  return unwrapped.join("\n");
}
/**
 * Inspect raw CSV text and propose an import configuration.
 *
 * Pipeline: unwrap whole-line quoting, pick a delimiter from the first ten
 * non-blank lines, parse with Papa, decide whether the first row is a header,
 * then locate the date, numeric/balance, description and amount columns from
 * up to 20 sample rows.
 *
 * @param rawContent - CSV text as read from the file.
 * @returns The suggested configuration, or `null` when there are fewer than
 *   two usable lines/rows or when no delimiter, date column or amount column
 *   can be determined.
 */
export function autoDetectConfig(rawContent: string): AutoDetectResult | null {
  const content = preprocessQuotedCSV(rawContent);

  const nonBlankLines = content.split(/\r?\n/).filter((line) => line.trim());
  if (nonBlankLines.length < 2) return null;

  // Delimiter
  const delimiter = detectDelimiter(nonBlankLines.slice(0, 10));
  if (!delimiter) return null;

  const table = Papa.parse(content, { delimiter, skipEmptyLines: true })
    .data as string[][];
  if (table.length < 2) return null;

  // Header and sample window
  const hasHeader = detectHeader(table[0]);
  const firstDataRow = hasHeader ? 1 : 0;
  const sample = table.slice(firstDataRow, firstDataRow + 20);
  if (sample.length === 0) return null;

  // Widest of the first ten parsed rows (header included).
  const colCount = table
    .slice(0, 10)
    .reduce((widest, row) => Math.max(widest, row.length), 0);

  // Date column and format
  const dateInfo = detectDateColumn(sample, colCount);
  if (!dateInfo) return null;

  // Numeric columns, minus those that behave like a running balance
  const numericCols = detectNumericColumns(sample, colCount);
  const balanceCols = detectBalanceColumns(sample, numericCols);
  const amountCandidates = numericCols.filter((col) => !balanceCols.has(col));

  // Description column: anything that is neither the date nor numeric
  const descriptionCol = detectDescriptionColumn(
    sample,
    colCount,
    dateInfo.column,
    new Set([...numericCols])
  );

  // Amount layout
  const amountInfo = detectAmountMode(sample, amountCandidates);
  if (!amountInfo) return null;

  const columnMapping: ColumnMapping = {
    date: dateInfo.column,
    description: descriptionCol,
  };
  let signConvention: SignConvention = "negative_expense";
  if (amountInfo.mode === "single") {
    columnMapping.amount = amountInfo.amountCol;
    signConvention = amountInfo.signConvention;
  } else {
    columnMapping.debitAmount = amountInfo.debitCol;
    columnMapping.creditAmount = amountInfo.creditCol;
  }

  return {
    delimiter,
    hasHeader,
    skipLines: 0,
    dateFormat: dateInfo.format,
    columnMapping,
    amountMode: amountInfo.mode,
    signConvention,
  };
}
/**
 * Choose the delimiter that splits the given lines most consistently.
 *
 * Each candidate in `DELIMITERS` is scored as
 * (share of lines with the same field count as the first line) × (that field
 * count). Candidates that leave the first line as a single field are skipped.
 *
 * @param lines - Sample lines, each parsed on its own.
 * @returns The highest-scoring delimiter (the earlier candidate wins ties),
 *   or `null` when no candidate scores above zero.
 */
function detectDelimiter(lines: string[]): string | null {
  let winner: string | null = null;
  let winnerScore = 0;

  for (const candidate of DELIMITERS) {
    const widths: number[] = [];
    for (const line of lines) {
      const firstRow = Papa.parse(line, { delimiter: candidate })
        .data[0] as string[];
      widths.push(firstRow.length);
    }

    // Need at least one line, and the first line must actually split.
    if (widths.length === 0 || widths[0] <= 1) continue;

    const reference = widths[0];
    let agreeing = 0;
    for (const width of widths) {
      if (width === reference) agreeing++;
    }

    const score = (agreeing / widths.length) * reference;
    if (score > winnerScore) {
      winnerScore = score;
      winner = candidate;
    }
  }

  return winner;
}
/**
 * Decide whether the first row looks like a header.
 *
 * The row is treated as a header when none of its non-empty cells parses as
 * an amount (`parseFrenchAmount`) or as a date in any of `DATE_FORMATS`.
 *
 * @param firstRow - Cells of the first parsed row.
 * @returns `true` when no cell is a parseable number or date.
 */
function detectHeader(firstRow: string[]): boolean {
  let sawNumber = false;
  let sawDate = false;

  for (const rawCell of firstRow) {
    const text = rawCell?.trim();
    if (!text) continue;

    if (!isNaN(parseFrenchAmount(text))) sawNumber = true;
    if (DATE_FORMATS.some((fmt) => parseDate(text, fmt))) sawDate = true;
  }

  return !(sawDate || sawNumber);
}
/**
 * Find the column and date format with the highest parse success rate.
 *
 * Every (column, format) pair is scored as parsed cells / non-empty cells.
 * The first pair reaching the highest rate is kept.
 *
 * @param rows - Sample data rows.
 * @param colCount - Number of columns to examine.
 * @returns The best column index and format, or `null` when the best rate is
 *   below 0.8 or no column contains data.
 */
function detectDateColumn(
  rows: string[][],
  colCount: number
): { column: number; format: string } | null {
  // Share of non-empty cells in `col` that parse with `fmt`; null if the column is empty.
  const parseRate = (col: number, fmt: string): number | null => {
    let parsed = 0;
    let seen = 0;
    for (const row of rows) {
      const text = row[col]?.trim();
      if (!text) continue;
      seen++;
      if (parseDate(text, fmt)) parsed++;
    }
    return seen === 0 ? null : parsed / seen;
  };

  let winner = { column: -1, format: "" };
  let winnerRate = 0;

  for (let col = 0; col < colCount; col++) {
    for (const fmt of DATE_FORMATS) {
      const rate = parseRate(col, fmt);
      if (rate === null) continue;
      if (rate > winnerRate) {
        winnerRate = rate;
        winner = { column: col, format: fmt };
      }
    }
  }

  if (winner.column < 0 || winnerRate < 0.8) return null;
  return winner;
}
/**
 * List the columns whose non-empty cells are mostly parseable amounts.
 *
 * A column qualifies when at least half of its non-empty cells parse with
 * `parseFrenchAmount`. Columns with more than two non-empty cells but at most
 * one distinct numeric value are dropped (e.g. account or transit numbers
 * repeated on every row).
 *
 * @param rows - Sample data rows.
 * @param colCount - Number of columns to examine.
 * @returns Qualifying column indices in ascending order.
 */
function detectNumericColumns(rows: string[][], colCount: number): number[] {
  const numericCols: number[] = [];

  for (let col = 0; col < colCount; col++) {
    // Parse every non-empty cell; unparseable cells stay as NaN.
    const parsedCells: number[] = [];
    for (const row of rows) {
      const text = row[col]?.trim();
      if (text) parsedCells.push(parseFrenchAmount(text));
    }

    const filled = parsedCells.length;
    if (filled === 0) continue;

    const numbers = parsedCells.filter((value) => !isNaN(value));
    if (numbers.length / filled < 0.5) continue;

    // Constant-value column: not a transaction amount.
    const distinct = new Set<number>(numbers);
    if (distinct.size <= 1 && filled > 2) continue;

    numericCols.push(col);
  }

  return numericCols;
}
/**
 * Identify numeric columns that behave like a running balance.
 *
 * A column is flagged when, for at least 80% of the testable consecutive row
 * pairs (and at least two of them), its row-to-row difference matches either
 *  - ± the value of one other numeric column on the same row, or
 *  - the difference of two other numeric columns on the same row, in either
 *    order, with missing values in those two columns counted as 0.
 *
 * @param rows - Sample data rows, in file order.
 * @param numericCols - Indices of the columns considered numeric.
 * @returns The set of column indices judged to be balances; empty when there
 *   are fewer than two numeric columns or fewer than three rows.
 */
function detectBalanceColumns(
  rows: string[][],
  numericCols: number[]
): Set<number> {
  const detected = new Set<number>();
  if (numericCols.length < 2 || rows.length < 3) return detected;

  const TOLERANCE = 0.015; // tolerance for floating-point comparison

  type Series = (number | null)[];

  // Parse each numeric column once; empty or unparseable cells become null.
  const seriesByCol = new Map<number, Series>();
  for (const col of numericCols) {
    const series: Series = [];
    for (const row of rows) {
      const text = row[col]?.trim();
      const parsed = text ? parseFrenchAmount(text) : NaN;
      series.push(isNaN(parsed) ? null : parsed);
    }
    seriesByCol.set(col, series);
  }

  const near = (x: number, y: number): boolean => Math.abs(x - y) < TOLERANCE;
  const passes = (matches: number, tested: number): boolean =>
    tested >= 2 && matches / tested >= 0.8;

  // balance[i] ≈ balance[i-1] ± amount[i]
  const followsSingle = (bal: Series, amt: Series): boolean => {
    let matches = 0;
    let tested = 0;
    for (let i = 1; i < rows.length; i++) {
      const current = bal[i];
      const previous = bal[i - 1];
      const amount = amt[i];
      if (current === null || previous === null || amount === null) continue;
      tested++;
      const delta = current - previous;
      if (near(delta, amount) || near(delta, -amount)) matches++;
    }
    return passes(matches, tested);
  };

  // balance[i] ≈ balance[i-1] - debit[i] + credit[i], trying both role assignments
  const followsPair = (bal: Series, first: Series, second: Series): boolean => {
    let matches = 0;
    let tested = 0;
    for (let i = 1; i < rows.length; i++) {
      const current = bal[i];
      const previous = bal[i - 1];
      if (current === null || previous === null) continue;
      const da = first[i] ?? 0;
      const db = second[i] ?? 0;
      tested++;
      const delta = current - previous;
      if (near(delta, -da + db) || near(delta, da - db)) matches++;
    }
    return passes(matches, tested);
  };

  for (const balCol of numericCols) {
    const bal = seriesByCol.get(balCol)!;
    const others = numericCols.filter((col) => col !== balCol);

    const matchesSingle = others.some((col) =>
      followsSingle(bal, seriesByCol.get(col)!)
    );

    const matchesPair =
      !matchesSingle &&
      others.some((colA, idx) =>
        others
          .slice(idx + 1)
          .some((colB) =>
            followsPair(bal, seriesByCol.get(colA)!, seriesByCol.get(colB)!)
          )
      );

    if (matchesSingle || matchesPair) detected.add(balCol);
  }

  return detected;
}
/**
 * Pick the column most likely to hold the transaction description.
 *
 * Among the columns that are neither the date column nor numeric, the one
 * with the greatest average length of its non-empty (trimmed) cells wins;
 * the earlier column wins ties.
 *
 * If every eligible column is blank in the sample, the first eligible column
 * is returned rather than index 0, so the result never points at the date
 * column or a numeric column while an eligible one exists.
 *
 * @param rows - Sample data rows.
 * @param colCount - Number of columns to examine.
 * @param dateCol - Index of the detected date column (excluded).
 * @param numericCols - Indices of numeric columns (excluded).
 * @returns The chosen column index, or 0 when no column is eligible.
 */
function detectDescriptionColumn(
  rows: string[][],
  colCount: number,
  dateCol: number,
  numericCols: Set<number>
): number {
  let bestCol = -1;
  // Start below zero so that an eligible but blank column can still be selected.
  let bestAvgLen = -1;

  for (let col = 0; col < colCount; col++) {
    if (col === dateCol || numericCols.has(col)) continue;

    let totalLen = 0;
    let count = 0;
    for (const row of rows) {
      const cell = row[col]?.trim();
      if (!cell) continue;
      totalLen += cell.length;
      count++;
    }

    const avgLen = count > 0 ? totalLen / count : 0;
    // Strict comparison keeps the earliest column on ties.
    if (avgLen > bestAvgLen) {
      bestAvgLen = avgLen;
      bestCol = col;
    }
  }

  // No eligible column at all: keep the historical fallback of 0.
  return bestCol >= 0 ? bestCol : 0;
}
// Outcome of amount detection when one column holds the transaction amount.
interface SingleAmountResult {
mode: "single";
// Index of the amount column.
amountCol: number;
// Convention inferred by `detectSingleAmount` from the share of negative values.
signConvention: SignConvention;
}
// Outcome of amount detection when amounts are split across two sparse,
// mutually exclusive columns.
interface DebitCreditResult {
mode: "debit_credit";
// `detectAmountMode` assigns the first column of the matching pair as debit...
debitCol: number;
// ...and the second column of the pair as credit.
creditCol: number;
}
// Discriminated on `mode`.
type AmountModeResult = SingleAmountResult | DebitCreditResult;
/**
 * Decide whether amounts live in one column or in a debit/credit pair.
 *
 * With two or more candidates, the first pair (in candidate order) that is
 * sparse-complementary is reported as debit/credit, the earlier column being
 * the debit. Otherwise the first candidate is treated as a single amount
 * column.
 *
 * @param rows - Sample data rows.
 * @param amountCandidates - Numeric column indices that are not balances.
 * @returns The detected layout, or `null` when there are no candidates.
 */
function detectAmountMode(
  rows: string[][],
  amountCandidates: number[]
): AmountModeResult | null {
  if (amountCandidates.length === 0) return null;

  if (amountCandidates.length > 1) {
    // Look for a debit/credit pattern among all unordered pairs.
    for (const [idx, first] of amountCandidates.entries()) {
      for (const second of amountCandidates.slice(idx + 1)) {
        if (isSparseComplementary(rows, first, second)) {
          return { mode: "debit_credit", debitCol: first, creditCol: second };
        }
      }
    }
  }

  // Single candidate, or no complementary pair: use the first candidate.
  return detectSingleAmount(rows, amountCandidates[0]);
}
/**
 * Build the single-amount result for a column and infer its sign convention.
 *
 * The convention is "negative_expense" when more than half of the parseable
 * values are negative, and "positive_expense" otherwise (including when no
 * value parses).
 *
 * @param rows - Sample data rows.
 * @param col - Index of the amount column.
 * @returns A `SingleAmountResult` for `col`.
 */
function detectSingleAmount(
  rows: string[][],
  col: number
): SingleAmountResult {
  // Collect every parseable amount in the column.
  const amounts: number[] = [];
  for (const row of rows) {
    const text = row[col]?.trim();
    if (!text) continue;
    const value = parseFrenchAmount(text);
    if (!isNaN(value)) amounts.push(value);
  }

  const total = amounts.length;
  const negatives = amounts.filter((value) => value < 0).length;
  const mostlyNegative = total > 0 && negatives / total > 0.5;

  let signConvention: SignConvention;
  if (mostlyNegative) {
    signConvention = "negative_expense";
  } else {
    signConvention = "positive_expense";
  }

  return { mode: "single", amountCol: col, signConvention };
}
/**
 * Test whether two columns look like a debit/credit pair.
 *
 * A cell "has a value" when it is non-empty, parses with `parseFrenchAmount`
 * and is not zero. Rows where neither column has a value are ignored. The
 * pair qualifies when at least 70% of the remaining rows have a value in
 * exactly one of the two columns.
 *
 * @param rows - Sample data rows.
 * @param colA - Index of the first column.
 * @param colB - Index of the second column.
 * @returns `true` when the columns are sparse and mutually exclusive enough.
 */
function isSparseComplementary(
  rows: string[][],
  colA: number,
  colB: number
): boolean {
  // Zero is treated like an empty cell.
  const hasAmount = (raw: string | undefined): boolean => {
    const text = raw?.trim();
    const value = text ? parseFrenchAmount(text) : NaN;
    return !isNaN(value) && value !== 0;
  };

  let relevantRows = 0;
  let exclusiveRows = 0;

  for (const row of rows) {
    const inA = hasAmount(row[colA]);
    const inB = hasAmount(row[colB]);
    if (inA || inB) {
      relevantRows++;
      if (inA !== inB) exclusiveRows++;
    }
  }

  return relevantRows > 0 && exclusiveRows / relevantRows >= 0.7;
}