File size: 2,524 Bytes
cdc55f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Dev utility — debug XLSX/CSV file processing step by step."""
import sys
sys.path.insert(0, r'C:\Users\Dhrumil.parikh\OneDrive - Taazaa Tech Pvt Ltd\Desktop\playbook_final\geminirag')

from app.models.db import get_engine
from sqlalchemy import text

# Get one of the problem XLSX files
engine = get_engine()
with engine.connect() as conn:
    rows = conn.execute(text(
        "SELECT filename, file_path FROM jobs WHERE filename IN ('SUM-2025-3241.xlsx', 'INV-2025-4291-GT.xlsx', 'CTR-2026-3042.xlsx') LIMIT 3"
    )).fetchall()

for r in rows:
    print(f"\n=== {r[0]} ===")
    try:
        import openpyxl
        wb = openpyxl.load_workbook(r[1], read_only=True, data_only=True)
        print(f"Sheets: {wb.sheetnames}")
        for sheet_name in wb.sheetnames[:1]:
            ws = wb[sheet_name]
            all_rows = list(ws.iter_rows())
            print(f"Sheet '{sheet_name}': {len(all_rows)} rows, {len(list(ws.iter_cols())) if all_rows else 0} cols")
    except Exception as e:
        print(f"  Error: {e}")

# Build the prompt for one file and measure it
print("\n--- Testing SUM file text length ---")
from app.processors.xlsx_proc import XLSXProcessor

class MockJob:
    def __init__(self, path, filename):
        self.file_path = path
        self.filename = filename
        self.id = "test"
        self.user_id = "test"
        self.file_type = "xlsx"

class MockSettings:
    GROQ_API_KEY = "test"
    GROQ_MODEL = "test"
    GROQ_VISION_MODEL = "test"

for r in rows[:1]:
    job = MockJob(r[1], r[0])
    proc = XLSXProcessor(job=job, settings=MockSettings())
    text = proc.extract()
    print(f"Full extracted text length: {len(text)} chars")
    print(f"After 1500-char truncation: {min(len(text), 1500)} chars")
    trunc = text[:1500]

    # Build actual prompt
    prompt = f"""You are a data analyst. Analyse the following spreadsheet data and return ONLY valid JSON.
No preamble, no markdown code blocks, just raw JSON.

Return this exact structure:
{{
  "title": "spreadsheet title or filename",
  "summary": "2-3 sentence summary of what this data contains",
  "sheets": ["list of sheet names found"],
  "column_descriptions": {{"column_name": "what it likely represents"}},
  "key_insights": ["notable patterns, max/min values, trends noticed"],
  "row_count": 0
}}

Spreadsheet data:
{trunc}
"""
    print(f"Full prompt char length: {len(prompt)}")
    print(f"Prompt first 300 chars:")
    print(prompt[:300])
    print(f"\nLast 300 chars of prompt:")
    print(prompt[-300:])