src/orders/ingestion/parsers/smart.parser.ts
Methods |
| parse | ||||||
parse(buffer: Buffer)
|
||||||
|
Parameters :
Returns :
literal type
|
import * as XLSX from 'xlsx';
import { Logger } from '@nestjs/common';
/**
* Smart deterministic parser for structured Excel workbooks.
* Extracts EVERY column — no data is skipped.
* Handles 100,000+ rows across multiple sheets in seconds.
* Captures section context (region headers like COAST, MOUNTAIN, etc.).
* Flags missing data per order.
*/
export interface SmartParsedOrder {
// === Core mapped fields ===
origin_address: string | null;
destination_address: string | null;
origin_contact_name: string | null;
origin_contact_phone: string | null;
dest_contact_name: string | null;
dest_contact_phone: string | null;
weight_kg: number | null;
volume_cbm: number | null;
pieces: number | null;
commodity: string | null;
vehicle_type: string | null;
pickup_date: string | null;
delivery_date: string | null;
special_instructions: string | null;
priority: string;
reference_number: string | null;
// === ALL 15 Excel columns ===
order_creation_date: string | null;
circulation_date: string | null;
plant: string | null;
distributor: string | null;
lpo_number: string | null;
beer_order_number: string | null;
udv_order_number: string | null;
pallets: number | null;
delivery_number_beer: string | null;
delivery_number_udv: string | null;
qty_beer: number | null;
qty_udv: number | null;
truck_plate: string | null;
status: string | null;
palletization: string | null;
// === Derived fields ===
trucks_needed: number; // How many 30T trucks this order needs
truck_capacity_kg: number; // Standard truck capacity (30,000 kg)
plant_full_name: string | null; // NI1 → "Nairobi (NI1)"
// === Context fields ===
region: string | null; // Section context: COAST, NAIROBI, etc.
sheet_date: string | null; // Sheet name (usually the date)
row_number: number; // Excel row number for traceability
// === Data quality ===
missing_fields: string[]; // Which critical fields are missing
data_warnings: string[]; // Issues found in this row
// === Raw data - preserves ALL columns including unmapped ===
raw_row: Record<string, unknown>;
}
// Section headers — these are region groupings, NOT data rows
// But we capture them as context for the orders that follow
const SECTION_PATTERNS: Record<string, RegExp> = {
'COAST': /^coast(\s+\w+)?$/i,
'NAIROBI': /^nairobi(\s+\w+)?$/i,
'MOUNTAIN': /^mountain(\s+\w+)?$/i,
'LAKE (NCD)': /^lake\s+.*n[ci]d/i,
'LAKE (KSM)': /^lake\s+.*ksm/i,
'LAKE': /^lake(\s+\w+)?$/i,
'WESTERN': /^western(\s+\w+)?$/i,
'CENTRAL': /^central(\s+\w+)?$/i,
'RIFT': /^rift\s*(valley)?(\s+\w+)?$/i,
'NYANZA': /^nyanza(\s+\w+)?$/i,
};
// Skip patterns — NOT data rows, NOT section headers
// NOTE: "PACKED" is NOT skipped — it's a section marker. Orders under PACKED are "not yet cleared"
const SKIP_ROW = /^(spills?\s*$|plan\s*$|post.*cut|eaml\s*$|total|sub\s*total|grand\s*total|\s*$)/i;
// PACKED is a special section — orders under it are not yet cleared (intentionally incomplete)
const PACKED_PATTERN = /^packed\s*$/i;
// Column name patterns — maps various header names to our field
const COLUMN_PATTERNS: Record<string, RegExp> = {
order_creation_date: /date.*order.*creation|order.*date|date.*created/i,
circulation_date: /date.*circulation|circulation.*date/i,
plant: /^plant$|^facility$|^loading.*facility$|^origin.*facility$/i,
distributor: /^distributor$|^site$|^delivery.*site$|^client.*site$|^consignee$/i,
lpo_number: /lpo.*no|lpo.*number|^lpo$/i,
beer_order: /beer.*order|beer_order|order.*ref.*a$|^ref.*a$|^order.*a$/i,
udv_order: /udv.*order|udv_order|order.*ref.*b$|^ref.*b$|^order.*b$/i,
pallets: /^pallet/i,
delivery_number_beer: /del.*no.*beer|delno.*beer|del.*#.*a$|^del.*a$/i,
delivery_number_udv: /del.*no.*udv|delno.*udv|del.*#.*b$|^del.*b$/i,
qty_beer: /qty.*beer|quantity.*beer|^qty.*a$|^qty_a$|^quantity.*a$/i,
qty_udv: /qty.*udv|quantity.*udv|^qty.*b$|^qty_b$|^quantity.*b$/i,
truck_allocation: /truck.*alloc|truck.*plate/i,
status: /^status$/i,
palletization: /palletiz|^packaging$/i,
// Generic transport columns (fallback)
origin: /origin|from|pickup.*location|source|loading.*point/i,
destination: /destination|to|delivery.*location/i,
weight: /weight|tonnage|^kg$|^tons$/i,
pieces_generic: /^pieces$|^qty$|^quantity$|^units$|^cases$|^crates$/i,
order_number: /order.*no|order.*number|order.*#|^ref/i,
commodity: /commodity|goods|product|description|cargo/i,
pickup_date_generic: /pickup.*date|collection.*date|load.*date/i,
delivery_date_generic: /delivery.*date|arrival.*date|^eta$/i,
vehicle_type: /vehicle.*type|truck.*type/i,
vehicle: /^vehicle$|^truck$/i,
priority: /priority|urgency/i,
lpo_generic: /^lpo$|purchase.*order|po.*no/i,
contact_name: /contact.*name|receiver|recipient/i,
contact_phone: /contact.*phone|phone|mobile|tel/i,
notes: /notes|remarks|comments|instructions/i,
};
export class SmartParser {
private readonly logger = new Logger(SmartParser.name);
parse(buffer: Buffer): { orders: SmartParsedOrder[]; summary: Record<string, unknown> } {
const workbook = XLSX.read(buffer, { type: 'buffer', cellDates: true });
const allOrders: SmartParsedOrder[] = [];
const sheetsProcessed: string[] = [];
const allMissingData: { sheet: string; row: number; fields: string[] }[] = [];
for (const sheetName of workbook.SheetNames) {
const worksheet = workbook.Sheets[sheetName];
const rows: unknown[][] = XLSX.utils.sheet_to_json(worksheet, {
header: 1,
defval: '',
blankrows: false,
});
if (rows.length < 2) continue;
// Find the header row
let headerRowIdx = -1;
let columnMap: Record<string, number> = {};
let headerRow: unknown[] = [];
for (let i = 0; i < Math.min(20, rows.length); i++) {
const row = rows[i];
if (!Array.isArray(row)) continue;
const mapped = this.mapColumns(row);
const matchCount = Object.keys(mapped).length;
if (matchCount >= 3 && matchCount > Object.keys(columnMap).length) {
headerRowIdx = i;
columnMap = mapped;
headerRow = row;
}
}
if (headerRowIdx === -1) {
this.logger.warn(`Sheet "${sheetName}": no header row found, skipping`);
continue;
}
sheetsProcessed.push(sheetName);
// Build full header names for raw_row mapping
const headerNames: string[] = headerRow.map((h, i) => {
const name = String(h || '').trim();
return name || `column_${i}`;
});
this.logger.log(
`Sheet "${sheetName}": header at row ${headerRowIdx}, ` +
`${Object.keys(columnMap).length} mapped, ${headerNames.filter(h => !h.startsWith('column_')).length} total columns`,
);
// Track current section/region context
let currentRegion: string | null = null;
let isPacked = false; // Orders under PACKED section are "not yet cleared"
// Process ALL rows after the header
for (let i = headerRowIdx + 1; i < rows.length; i++) {
const row = rows[i];
if (!Array.isArray(row)) continue;
// Get the distributor/destination value (fall back to plant/origin if needed)
const distCol = columnMap['distributor'] ?? columnMap['destination'] ?? columnMap['plant'] ?? columnMap['origin'];
const distVal = distCol !== undefined ? String(row[distCol] || '').trim() : '';
if (!distVal) continue;
// Check for PACKED section — orders under this are not yet cleared
if (PACKED_PATTERN.test(distVal)) {
isPacked = true;
continue;
}
// Check if this is a section header — capture as context, don't skip
const section = this.matchSection(distVal);
if (section) {
currentRegion = section;
isPacked = false; // New region section resets packed state
continue; // Section headers are NOT orders
}
// Skip non-data rows (totals, spills, plan markers)
if (SKIP_ROW.test(distVal)) continue;
// Import ALL rows — even incomplete ones. User can edit before approving.
// Only section headers and totals are skipped (handled above).
// Build raw_row with ALL columns preserved
const rawRow: Record<string, unknown> = {};
for (let col = 0; col < row.length; col++) {
const val = row[col];
if (val !== '' && val !== null && val !== undefined) {
rawRow[headerNames[col] || `column_${col}`] = val;
}
}
const order = this.extractOrder(row, columnMap, sheetName, currentRegion, i + 1, rawRow, isPacked);
if (order) {
allOrders.push(order);
if (order.missing_fields.length > 0) {
allMissingData.push({
sheet: sheetName,
row: i + 1,
fields: order.missing_fields,
});
}
}
}
}
// Build summary
let totalQtyBeer = 0;
let totalQtyUdv = 0;
let totalPallets = 0;
let totalWeightKg = 0;
const origins = new Set<string>();
const destinations = new Set<string>();
const regions = new Set<string>();
const statuses: Record<string, number> = {};
const truckPlates = new Set<string>();
const ordersWithMissingData = allOrders.filter(o => o.missing_fields.length > 0);
for (const o of allOrders) {
if (o.qty_beer) totalQtyBeer += o.qty_beer;
if (o.qty_udv) totalQtyUdv += o.qty_udv;
if (o.pallets) totalPallets += o.pallets;
if (o.weight_kg) totalWeightKg += o.weight_kg;
if (o.plant) origins.add(o.plant);
if (o.distributor) destinations.add(o.distributor);
if (o.region) regions.add(o.region);
if (o.truck_plate) truckPlates.add(o.truck_plate);
if (o.status) {
statuses[o.status] = (statuses[o.status] || 0) + 1;
}
}
this.logger.log(`Smart parser: ${allOrders.length} orders from ${sheetsProcessed.length} sheets`);
if (ordersWithMissingData.length > 0) {
this.logger.warn(`${ordersWithMissingData.length} orders have missing data`);
}
return {
orders: allOrders,
summary: {
total_orders: allOrders.length,
total_qty_beer: totalQtyBeer,
total_qty_udv: totalQtyUdv,
total_pallets: totalPallets,
total_weight_kg: totalWeightKg || null,
unique_origins: Array.from(origins),
unique_destinations: Array.from(destinations),
unique_regions: Array.from(regions),
unique_trucks: Array.from(truckPlates),
truck_count: truckPlates.size,
status_breakdown: statuses,
sheets_processed: sheetsProcessed,
orders_with_missing_data: ordersWithMissingData.length,
missing_data_details: allMissingData.slice(0, 50), // First 50 for review
warnings: ordersWithMissingData.length > 0
? [`${ordersWithMissingData.length} orders have missing critical data — review required`]
: [],
},
};
}
private matchSection(value: string): string | null {
for (const [label, pattern] of Object.entries(SECTION_PATTERNS)) {
if (pattern.test(value)) return label;
}
return null;
}
private mapColumns(headerRow: unknown[]): Record<string, number> {
const mapped: Record<string, number> = {};
for (let col = 0; col < headerRow.length; col++) {
const header = String(headerRow[col] || '').trim();
if (!header) continue;
for (const [field, pattern] of Object.entries(COLUMN_PATTERNS)) {
if (pattern.test(header) && !(field in mapped)) {
mapped[field] = col;
break;
}
}
}
return mapped;
}
private extractOrder(
row: unknown[],
columnMap: Record<string, number>,
sheetName: string,
region: string | null,
excelRow: number,
rawRow: Record<string, unknown>,
isPacked = false,
): SmartParsedOrder | null {
const get = (field: string): string => {
const col = columnMap[field];
if (col === undefined) return '';
const val = row[col];
if (val === null || val === undefined) return '';
if (val instanceof Date) return val.toISOString().split('T')[0];
return String(val).trim();
};
const getNum = (field: string): number | null => {
const val = get(field);
if (!val) return null;
const num = parseFloat(val.replace(/[,\s]/g, ''));
return isNaN(num) ? null : num;
};
// === Extract ALL 15 columns ===
const orderCreationDate = get('order_creation_date') || get('pickup_date_generic');
const circulationDate = get('circulation_date') || get('delivery_date_generic');
const plant = get('plant') || get('origin');
const distributor = get('distributor') || get('destination');
const lpoNumber = get('lpo_number') || get('lpo_generic') || get('order_number');
const beerOrder = get('beer_order');
const udvOrder = get('udv_order');
const palletsVal = getNum('pallets');
const delNoBeer = get('delivery_number_beer');
const delNoUdv = get('delivery_number_udv');
const qtyBeer = getNum('qty_beer');
const qtyUdv = getNum('qty_udv');
const truckAllocation = get('truck_allocation') || get('vehicle');
const status = get('status');
const palletization = get('palletization');
// Must have at least a distributor to be a valid order
if (!distributor) return null;
// === Compute derived fields ===
const totalPieces = (qtyBeer || 0) + (qtyUdv || 0);
let commodity: string | null = null;
if (qtyBeer && qtyUdv) commodity = 'Mixed Goods';
else if (qtyBeer) commodity = 'Goods A';
else if (qtyUdv) commodity = 'Goods B';
// Weight calculation from case quantities
// EABL/KBL: 1 beer case (12x500ml) ≈ 11 kg, 1 UDV case ≈ 9 kg
// All trucks are 30-tonne capacity
const BEER_CASE_KG = 11;
const UDV_CASE_KG = 9;
const TRUCK_CAPACITY_KG = 30000; // All trucks are 30 tonnes
let weightKg = getNum('weight');
if (weightKg === null && (qtyBeer || qtyUdv)) {
weightKg = Math.round(((qtyBeer || 0) * BEER_CASE_KG) + ((qtyUdv || 0) * UDV_CASE_KG));
}
// Calculate how many 30T trucks this order needs
const trucksNeeded = weightKg ? Math.ceil(weightKg / TRUCK_CAPACITY_KG) : 1;
const priority = status?.toLowerCase() === 'packed' ? 'LOW' : 'NORMAL';
// Facility code mapping
const FACILITY_NAMES: Record<string, string> = {
'NI1': 'Nairobi Warehouse',
'KBS2': 'Kisumu Depot',
};
const plantFullName = plant ? (FACILITY_NAMES[plant.toUpperCase()] || plant) : null;
// === Track missing critical data ===
const missingFields: string[] = [];
const dataWarnings: string[] = [];
if (!plant) missingFields.push('FACILITY');
if (!lpoNumber) missingFields.push('LPO NO.');
if (!orderCreationDate) missingFields.push('ORDER DATE');
if (!circulationDate) missingFields.push('DELIVERY DATE');
// TRUCK ALLOCATION is NOT missing — system will auto-allocate
if (!status) missingFields.push('STATUS');
if (qtyBeer === null && qtyUdv === null) missingFields.push('QTY (A or B)');
if (!beerOrder && !udvOrder) missingFields.push('ORDER REF (A or B)');
if (!delNoBeer && !delNoUdv) missingFields.push('DELIVERY NUMBER (A or B)');
// PACKED section — order not yet cleared, intentionally incomplete
if (isPacked) {
dataWarnings.push('NOT CLEARED — order is under PACKED section (not yet dispatched by client)');
// Missing delivery numbers are expected for packed orders
const idx = missingFields.indexOf('DELIVERY NUMBER (A or B)');
if (idx >= 0) missingFields.splice(idx, 1);
}
// Data quality warnings
if (totalPieces === 0 && !isPacked) dataWarnings.push('Zero quantity — verify order is real');
if (status && !['LOADED', 'PACKED', 'DISPATCHED', 'DELIVERED', 'PENDING'].includes(status.toUpperCase())) {
dataWarnings.push(`Unusual status: "${status}"`);
}
if (truckAllocation && !/^K[A-Z]{2}/i.test(truckAllocation)) {
dataWarnings.push(`Truck plate may be invalid: "${truckAllocation}"`);
}
return {
// Core mapped — with proper business meaning
origin_address: plantFullName || plant || null, // Loading location
destination_address: distributor || null, // Delivery destination
origin_contact_name: get('contact_name') || null,
origin_contact_phone: get('contact_phone') || null,
dest_contact_name: null,
dest_contact_phone: null,
weight_kg: weightKg, // Calculated from qty × case weight
volume_cbm: null,
pieces: totalPieces || null, // Total cases (beer + UDV)
commodity,
vehicle_type: 'DRY_VAN', // All trucks are 30T dry vans
pickup_date: orderCreationDate || null, // When order came in
delivery_date: circulationDate || null, // Circulation date
special_instructions: [palletization, get('notes')].filter(Boolean).join('; ') || null,
priority,
reference_number: lpoNumber || null,
trucks_needed: trucksNeeded, // How many 30T trucks needed
truck_capacity_kg: TRUCK_CAPACITY_KG,
// ALL 15 Excel columns — exact values
order_creation_date: orderCreationDate || null, // When the order came
circulation_date: circulationDate || null, // Circulation date
plant: plant || null, // Loading point (NI1/KBS2)
plant_full_name: plantFullName, // Full name (Nairobi/Kisumu)
distributor: distributor || null, // Delivery destination
lpo_number: lpoNumber || null,
beer_order_number: beerOrder || null,
udv_order_number: udvOrder || null,
pallets: palletsVal,
delivery_number_beer: delNoBeer || null,
delivery_number_udv: delNoUdv || null,
qty_beer: qtyBeer,
qty_udv: qtyUdv,
truck_plate: truckAllocation || null, // From Excel — suggestion only, system auto-allocates
status: status || null, // LOADED, PACKED, etc.
palletization: palletization || null,
// Context
region: region || null,
sheet_date: sheetName,
row_number: excelRow,
// Data quality
missing_fields: missingFields,
data_warnings: dataWarnings,
// Raw - ALL columns preserved
raw_row: rawRow,
};
}
}