"""Dev utility — debug XLSX/CSV file processing step by step.""" import sys sys.path.insert(0, r'C:\Users\Dhrumil.parikh\OneDrive - Taazaa Tech Pvt Ltd\Desktop\playbook_final\geminirag') from app.models.db import get_engine from sqlalchemy import text # Get one of the problem XLSX files engine = get_engine() with engine.connect() as conn: rows = conn.execute(text( "SELECT filename, file_path FROM jobs WHERE filename IN ('SUM-2025-3241.xlsx', 'INV-2025-4291-GT.xlsx', 'CTR-2026-3042.xlsx') LIMIT 3" )).fetchall() for r in rows: print(f"\n=== {r[0]} ===") try: import openpyxl wb = openpyxl.load_workbook(r[1], read_only=True, data_only=True) print(f"Sheets: {wb.sheetnames}") for sheet_name in wb.sheetnames[:1]: ws = wb[sheet_name] all_rows = list(ws.iter_rows()) print(f"Sheet '{sheet_name}': {len(all_rows)} rows, {len(list(ws.iter_cols())) if all_rows else 0} cols") except Exception as e: print(f" Error: {e}") # Build the prompt for one file and measure it print("\n--- Testing SUM file text length ---") from app.processors.xlsx_proc import XLSXProcessor class MockJob: def __init__(self, path, filename): self.file_path = path self.filename = filename self.id = "test" self.user_id = "test" self.file_type = "xlsx" class MockSettings: GROQ_API_KEY = "test" GROQ_MODEL = "test" GROQ_VISION_MODEL = "test" for r in rows[:1]: job = MockJob(r[1], r[0]) proc = XLSXProcessor(job=job, settings=MockSettings()) text = proc.extract() print(f"Full extracted text length: {len(text)} chars") print(f"After 1500-char truncation: {min(len(text), 1500)} chars") trunc = text[:1500] # Build actual prompt prompt = f"""You are a data analyst. Analyse the following spreadsheet data and return ONLY valid JSON. No preamble, no markdown code blocks, just raw JSON. Return this exact structure: {{ "title": "spreadsheet title or filename", "summary": "2-3 sentence summary of what this data contains", "sheets": ["list of sheet names found"], "column_descriptions": {{"column_name": "what it likely represents"}}, "key_insights": ["notable patterns, max/min values, trends noticed"], "row_count": 0 }} Spreadsheet data: {trunc} """ print(f"Full prompt char length: {len(prompt)}") print(f"Prompt first 300 chars:") print(prompt[:300]) print(f"\nLast 300 chars of prompt:") print(prompt[-300:])