File

src/orders/ingestion/parsers/smart.parser.ts

Index

Methods

Methods

parse
parse(buffer: Buffer)
Parameters :
Name Type Optional
buffer Buffer No
Returns : literal type
import * as XLSX from 'xlsx';
import { Logger } from '@nestjs/common';

/**
 * Smart deterministic parser for structured Excel workbooks.
 * Extracts EVERY column — no data is skipped.
 * Handles 100,000+ rows across multiple sheets in seconds.
 * Captures section context (region headers like COAST, MOUNTAIN, etc.).
 * Flags missing data per order.
 */

export interface SmartParsedOrder {
  // === Core mapped fields ===
  origin_address: string | null;
  destination_address: string | null;
  origin_contact_name: string | null;
  origin_contact_phone: string | null;
  dest_contact_name: string | null;
  dest_contact_phone: string | null;
  weight_kg: number | null;
  volume_cbm: number | null;
  pieces: number | null;
  commodity: string | null;
  vehicle_type: string | null;
  pickup_date: string | null;
  delivery_date: string | null;
  special_instructions: string | null;
  priority: string;
  reference_number: string | null;

  // === ALL 15 Excel columns ===
  order_creation_date: string | null;
  circulation_date: string | null;
  plant: string | null;
  distributor: string | null;
  lpo_number: string | null;
  beer_order_number: string | null;
  udv_order_number: string | null;
  pallets: number | null;
  delivery_number_beer: string | null;
  delivery_number_udv: string | null;
  qty_beer: number | null;
  qty_udv: number | null;
  truck_plate: string | null;
  status: string | null;
  palletization: string | null;

  // === Derived fields ===
  trucks_needed: number;          // How many 30T trucks this order needs
  truck_capacity_kg: number;      // Standard truck capacity (30,000 kg)
  plant_full_name: string | null; // NI1 → "Nairobi (NI1)"

  // === Context fields ===
  region: string | null;          // Section context: COAST, NAIROBI, etc.
  sheet_date: string | null;      // Sheet name (usually the date)
  row_number: number;             // Excel row number for traceability

  // === Data quality ===
  missing_fields: string[];       // Which critical fields are missing
  data_warnings: string[];        // Issues found in this row

  // === Raw data - preserves ALL columns including unmapped ===
  raw_row: Record<string, unknown>;
}

// Section headers — these are region groupings, NOT data rows
// But we capture them as context for the orders that follow
const SECTION_PATTERNS: Record<string, RegExp> = {
  'COAST': /^coast(\s+\w+)?$/i,
  'NAIROBI': /^nairobi(\s+\w+)?$/i,
  'MOUNTAIN': /^mountain(\s+\w+)?$/i,
  'LAKE (NCD)': /^lake\s+.*n[ci]d/i,
  'LAKE (KSM)': /^lake\s+.*ksm/i,
  'LAKE': /^lake(\s+\w+)?$/i,
  'WESTERN': /^western(\s+\w+)?$/i,
  'CENTRAL': /^central(\s+\w+)?$/i,
  'RIFT': /^rift\s*(valley)?(\s+\w+)?$/i,
  'NYANZA': /^nyanza(\s+\w+)?$/i,
};

// Skip patterns — NOT data rows, NOT section headers
// NOTE: "PACKED" is NOT skipped — it's a section marker. Orders under PACKED are "not yet cleared"
const SKIP_ROW = /^(spills?\s*$|plan\s*$|post.*cut|eaml\s*$|total|sub\s*total|grand\s*total|\s*$)/i;

// PACKED is a special section — orders under it are not yet cleared (intentionally incomplete)
const PACKED_PATTERN = /^packed\s*$/i;

// Column name patterns — maps various header names to our field
const COLUMN_PATTERNS: Record<string, RegExp> = {
  order_creation_date: /date.*order.*creation|order.*date|date.*created/i,
  circulation_date: /date.*circulation|circulation.*date/i,
  plant: /^plant$|^facility$|^loading.*facility$|^origin.*facility$/i,
  distributor: /^distributor$|^site$|^delivery.*site$|^client.*site$|^consignee$/i,
  lpo_number: /lpo.*no|lpo.*number|^lpo$/i,
  beer_order: /beer.*order|beer_order|order.*ref.*a$|^ref.*a$|^order.*a$/i,
  udv_order: /udv.*order|udv_order|order.*ref.*b$|^ref.*b$|^order.*b$/i,
  pallets: /^pallet/i,
  delivery_number_beer: /del.*no.*beer|delno.*beer|del.*#.*a$|^del.*a$/i,
  delivery_number_udv: /del.*no.*udv|delno.*udv|del.*#.*b$|^del.*b$/i,
  qty_beer: /qty.*beer|quantity.*beer|^qty.*a$|^qty_a$|^quantity.*a$/i,
  qty_udv: /qty.*udv|quantity.*udv|^qty.*b$|^qty_b$|^quantity.*b$/i,
  truck_allocation: /truck.*alloc|truck.*plate/i,
  status: /^status$/i,
  palletization: /palletiz|^packaging$/i,
  // Generic transport columns (fallback)
  origin: /origin|from|pickup.*location|source|loading.*point/i,
  destination: /destination|to|delivery.*location/i,
  weight: /weight|tonnage|^kg$|^tons$/i,
  pieces_generic: /^pieces$|^qty$|^quantity$|^units$|^cases$|^crates$/i,
  order_number: /order.*no|order.*number|order.*#|^ref/i,
  commodity: /commodity|goods|product|description|cargo/i,
  pickup_date_generic: /pickup.*date|collection.*date|load.*date/i,
  delivery_date_generic: /delivery.*date|arrival.*date|^eta$/i,
  vehicle_type: /vehicle.*type|truck.*type/i,
  vehicle: /^vehicle$|^truck$/i,
  priority: /priority|urgency/i,
  lpo_generic: /^lpo$|purchase.*order|po.*no/i,
  contact_name: /contact.*name|receiver|recipient/i,
  contact_phone: /contact.*phone|phone|mobile|tel/i,
  notes: /notes|remarks|comments|instructions/i,
};

export class SmartParser {
  private readonly logger = new Logger(SmartParser.name);

  parse(buffer: Buffer): { orders: SmartParsedOrder[]; summary: Record<string, unknown> } {
    const workbook = XLSX.read(buffer, { type: 'buffer', cellDates: true });
    const allOrders: SmartParsedOrder[] = [];
    const sheetsProcessed: string[] = [];
    const allMissingData: { sheet: string; row: number; fields: string[] }[] = [];

    for (const sheetName of workbook.SheetNames) {
      const worksheet = workbook.Sheets[sheetName];
      const rows: unknown[][] = XLSX.utils.sheet_to_json(worksheet, {
        header: 1,
        defval: '',
        blankrows: false,
      });

      if (rows.length < 2) continue;

      // Find the header row
      let headerRowIdx = -1;
      let columnMap: Record<string, number> = {};
      let headerRow: unknown[] = [];

      for (let i = 0; i < Math.min(20, rows.length); i++) {
        const row = rows[i];
        if (!Array.isArray(row)) continue;

        const mapped = this.mapColumns(row);
        const matchCount = Object.keys(mapped).length;

        if (matchCount >= 3 && matchCount > Object.keys(columnMap).length) {
          headerRowIdx = i;
          columnMap = mapped;
          headerRow = row;
        }
      }

      if (headerRowIdx === -1) {
        this.logger.warn(`Sheet "${sheetName}": no header row found, skipping`);
        continue;
      }

      sheetsProcessed.push(sheetName);

      // Build full header names for raw_row mapping
      const headerNames: string[] = headerRow.map((h, i) => {
        const name = String(h || '').trim();
        return name || `column_${i}`;
      });

      this.logger.log(
        `Sheet "${sheetName}": header at row ${headerRowIdx}, ` +
        `${Object.keys(columnMap).length} mapped, ${headerNames.filter(h => !h.startsWith('column_')).length} total columns`,
      );

      // Track current section/region context
      let currentRegion: string | null = null;
      let isPacked = false; // Orders under PACKED section are "not yet cleared"

      // Process ALL rows after the header
      for (let i = headerRowIdx + 1; i < rows.length; i++) {
        const row = rows[i];
        if (!Array.isArray(row)) continue;

        // Get the distributor/destination value (fall back to plant/origin if needed)
        const distCol = columnMap['distributor'] ?? columnMap['destination'] ?? columnMap['plant'] ?? columnMap['origin'];
        const distVal = distCol !== undefined ? String(row[distCol] || '').trim() : '';

        if (!distVal) continue;

        // Check for PACKED section — orders under this are not yet cleared
        if (PACKED_PATTERN.test(distVal)) {
          isPacked = true;
          continue;
        }

        // Check if this is a section header — capture as context, don't skip
        const section = this.matchSection(distVal);
        if (section) {
          currentRegion = section;
          isPacked = false; // New region section resets packed state
          continue; // Section headers are NOT orders
        }

        // Skip non-data rows (totals, spills, plan markers)
        if (SKIP_ROW.test(distVal)) continue;

        // Import ALL rows — even incomplete ones. User can edit before approving.
        // Only section headers and totals are skipped (handled above).

        // Build raw_row with ALL columns preserved
        const rawRow: Record<string, unknown> = {};
        for (let col = 0; col < row.length; col++) {
          const val = row[col];
          if (val !== '' && val !== null && val !== undefined) {
            rawRow[headerNames[col] || `column_${col}`] = val;
          }
        }

        const order = this.extractOrder(row, columnMap, sheetName, currentRegion, i + 1, rawRow, isPacked);
        if (order) {
          allOrders.push(order);
          if (order.missing_fields.length > 0) {
            allMissingData.push({
              sheet: sheetName,
              row: i + 1,
              fields: order.missing_fields,
            });
          }
        }
      }
    }

    // Build summary
    let totalQtyBeer = 0;
    let totalQtyUdv = 0;
    let totalPallets = 0;
    let totalWeightKg = 0;
    const origins = new Set<string>();
    const destinations = new Set<string>();
    const regions = new Set<string>();
    const statuses: Record<string, number> = {};
    const truckPlates = new Set<string>();
    const ordersWithMissingData = allOrders.filter(o => o.missing_fields.length > 0);

    for (const o of allOrders) {
      if (o.qty_beer) totalQtyBeer += o.qty_beer;
      if (o.qty_udv) totalQtyUdv += o.qty_udv;
      if (o.pallets) totalPallets += o.pallets;
      if (o.weight_kg) totalWeightKg += o.weight_kg;
      if (o.plant) origins.add(o.plant);
      if (o.distributor) destinations.add(o.distributor);
      if (o.region) regions.add(o.region);
      if (o.truck_plate) truckPlates.add(o.truck_plate);
      if (o.status) {
        statuses[o.status] = (statuses[o.status] || 0) + 1;
      }
    }

    this.logger.log(`Smart parser: ${allOrders.length} orders from ${sheetsProcessed.length} sheets`);
    if (ordersWithMissingData.length > 0) {
      this.logger.warn(`${ordersWithMissingData.length} orders have missing data`);
    }

    return {
      orders: allOrders,
      summary: {
        total_orders: allOrders.length,
        total_qty_beer: totalQtyBeer,
        total_qty_udv: totalQtyUdv,
        total_pallets: totalPallets,
        total_weight_kg: totalWeightKg || null,
        unique_origins: Array.from(origins),
        unique_destinations: Array.from(destinations),
        unique_regions: Array.from(regions),
        unique_trucks: Array.from(truckPlates),
        truck_count: truckPlates.size,
        status_breakdown: statuses,
        sheets_processed: sheetsProcessed,
        orders_with_missing_data: ordersWithMissingData.length,
        missing_data_details: allMissingData.slice(0, 50), // First 50 for review
        warnings: ordersWithMissingData.length > 0
          ? [`${ordersWithMissingData.length} orders have missing critical data — review required`]
          : [],
      },
    };
  }

  private matchSection(value: string): string | null {
    for (const [label, pattern] of Object.entries(SECTION_PATTERNS)) {
      if (pattern.test(value)) return label;
    }
    return null;
  }

  private mapColumns(headerRow: unknown[]): Record<string, number> {
    const mapped: Record<string, number> = {};

    for (let col = 0; col < headerRow.length; col++) {
      const header = String(headerRow[col] || '').trim();
      if (!header) continue;

      for (const [field, pattern] of Object.entries(COLUMN_PATTERNS)) {
        if (pattern.test(header) && !(field in mapped)) {
          mapped[field] = col;
          break;
        }
      }
    }

    return mapped;
  }

  private extractOrder(
    row: unknown[],
    columnMap: Record<string, number>,
    sheetName: string,
    region: string | null,
    excelRow: number,
    rawRow: Record<string, unknown>,
    isPacked = false,
  ): SmartParsedOrder | null {
    const get = (field: string): string => {
      const col = columnMap[field];
      if (col === undefined) return '';
      const val = row[col];
      if (val === null || val === undefined) return '';
      if (val instanceof Date) return val.toISOString().split('T')[0];
      return String(val).trim();
    };

    const getNum = (field: string): number | null => {
      const val = get(field);
      if (!val) return null;
      const num = parseFloat(val.replace(/[,\s]/g, ''));
      return isNaN(num) ? null : num;
    };

    // === Extract ALL 15 columns ===
    const orderCreationDate = get('order_creation_date') || get('pickup_date_generic');
    const circulationDate = get('circulation_date') || get('delivery_date_generic');
    const plant = get('plant') || get('origin');
    const distributor = get('distributor') || get('destination');
    const lpoNumber = get('lpo_number') || get('lpo_generic') || get('order_number');
    const beerOrder = get('beer_order');
    const udvOrder = get('udv_order');
    const palletsVal = getNum('pallets');
    const delNoBeer = get('delivery_number_beer');
    const delNoUdv = get('delivery_number_udv');
    const qtyBeer = getNum('qty_beer');
    const qtyUdv = getNum('qty_udv');
    const truckAllocation = get('truck_allocation') || get('vehicle');
    const status = get('status');
    const palletization = get('palletization');

    // Must have at least a distributor to be a valid order
    if (!distributor) return null;

    // === Compute derived fields ===
    const totalPieces = (qtyBeer || 0) + (qtyUdv || 0);

    let commodity: string | null = null;
    if (qtyBeer && qtyUdv) commodity = 'Mixed Goods';
    else if (qtyBeer) commodity = 'Goods A';
    else if (qtyUdv) commodity = 'Goods B';

    // Weight calculation from case quantities
    // EABL/KBL: 1 beer case (12x500ml) ≈ 11 kg, 1 UDV case ≈ 9 kg
    // All trucks are 30-tonne capacity
    const BEER_CASE_KG = 11;
    const UDV_CASE_KG = 9;
    const TRUCK_CAPACITY_KG = 30000; // All trucks are 30 tonnes
    let weightKg = getNum('weight');
    if (weightKg === null && (qtyBeer || qtyUdv)) {
      weightKg = Math.round(((qtyBeer || 0) * BEER_CASE_KG) + ((qtyUdv || 0) * UDV_CASE_KG));
    }

    // Calculate how many 30T trucks this order needs
    const trucksNeeded = weightKg ? Math.ceil(weightKg / TRUCK_CAPACITY_KG) : 1;

    const priority = status?.toLowerCase() === 'packed' ? 'LOW' : 'NORMAL';

    // Facility code mapping
    const FACILITY_NAMES: Record<string, string> = {
      'NI1': 'Nairobi Warehouse',
      'KBS2': 'Kisumu Depot',
    };
    const plantFullName = plant ? (FACILITY_NAMES[plant.toUpperCase()] || plant) : null;

    // === Track missing critical data ===
    const missingFields: string[] = [];
    const dataWarnings: string[] = [];

    if (!plant) missingFields.push('FACILITY');
    if (!lpoNumber) missingFields.push('LPO NO.');
    if (!orderCreationDate) missingFields.push('ORDER DATE');
    if (!circulationDate) missingFields.push('DELIVERY DATE');
    // TRUCK ALLOCATION is NOT missing — system will auto-allocate
    if (!status) missingFields.push('STATUS');
    if (qtyBeer === null && qtyUdv === null) missingFields.push('QTY (A or B)');
    if (!beerOrder && !udvOrder) missingFields.push('ORDER REF (A or B)');
    if (!delNoBeer && !delNoUdv) missingFields.push('DELIVERY NUMBER (A or B)');

    // PACKED section — order not yet cleared, intentionally incomplete
    if (isPacked) {
      dataWarnings.push('NOT CLEARED — order is under PACKED section (not yet dispatched by client)');
      // Missing delivery numbers are expected for packed orders
      const idx = missingFields.indexOf('DELIVERY NUMBER (A or B)');
      if (idx >= 0) missingFields.splice(idx, 1);
    }

    // Data quality warnings
    if (totalPieces === 0 && !isPacked) dataWarnings.push('Zero quantity — verify order is real');
    if (status && !['LOADED', 'PACKED', 'DISPATCHED', 'DELIVERED', 'PENDING'].includes(status.toUpperCase())) {
      dataWarnings.push(`Unusual status: "${status}"`);
    }
    if (truckAllocation && !/^K[A-Z]{2}/i.test(truckAllocation)) {
      dataWarnings.push(`Truck plate may be invalid: "${truckAllocation}"`);
    }

    return {
      // Core mapped — with proper business meaning
      origin_address: plantFullName || plant || null,   // Loading location
      destination_address: distributor || null,          // Delivery destination
      origin_contact_name: get('contact_name') || null,
      origin_contact_phone: get('contact_phone') || null,
      dest_contact_name: null,
      dest_contact_phone: null,
      weight_kg: weightKg,                              // Calculated from qty × case weight
      volume_cbm: null,
      pieces: totalPieces || null,                      // Total cases (beer + UDV)
      commodity,
      vehicle_type: 'DRY_VAN',                          // All trucks are 30T dry vans
      pickup_date: orderCreationDate || null,            // When order came in
      delivery_date: circulationDate || null,            // Circulation date
      special_instructions: [palletization, get('notes')].filter(Boolean).join('; ') || null,
      priority,
      reference_number: lpoNumber || null,
      trucks_needed: trucksNeeded,                       // How many 30T trucks needed
      truck_capacity_kg: TRUCK_CAPACITY_KG,

      // ALL 15 Excel columns — exact values
      order_creation_date: orderCreationDate || null,    // When the order came
      circulation_date: circulationDate || null,         // Circulation date
      plant: plant || null,                              // Loading point (NI1/KBS2)
      plant_full_name: plantFullName,                    // Full name (Nairobi/Kisumu)
      distributor: distributor || null,                  // Delivery destination
      lpo_number: lpoNumber || null,
      beer_order_number: beerOrder || null,
      udv_order_number: udvOrder || null,
      pallets: palletsVal,
      delivery_number_beer: delNoBeer || null,
      delivery_number_udv: delNoUdv || null,
      qty_beer: qtyBeer,
      qty_udv: qtyUdv,
      truck_plate: truckAllocation || null,             // From Excel — suggestion only, system auto-allocates
      status: status || null,                            // LOADED, PACKED, etc.
      palletization: palletization || null,

      // Context
      region: region || null,
      sheet_date: sheetName,
      row_number: excelRow,

      // Data quality
      missing_fields: missingFields,
      data_warnings: dataWarnings,

      // Raw - ALL columns preserved
      raw_row: rawRow,
    };
  }
}

results matching ""

    No results matching ""