Multimodel_Rag / scripts /debug_xlsx.py
Dhrumil Parikh
deploy GeminiRAG
cdc55f4
Raw
History Blame Contribute Delete
2.52 kB
"""Dev utility — debug XLSX/CSV file processing step by step."""
import sys
sys.path.insert(0, r'C:\Users\Dhrumil.parikh\OneDrive - Taazaa Tech Pvt Ltd\Desktop\playbook_final\geminirag')
from app.models.db import get_engine
from sqlalchemy import text
# Get one of the problem XLSX files
engine = get_engine()
with engine.connect() as conn:
rows = conn.execute(text(
"SELECT filename, file_path FROM jobs WHERE filename IN ('SUM-2025-3241.xlsx', 'INV-2025-4291-GT.xlsx', 'CTR-2026-3042.xlsx') LIMIT 3"
)).fetchall()
for r in rows:
print(f"\n=== {r[0]} ===")
try:
import openpyxl
wb = openpyxl.load_workbook(r[1], read_only=True, data_only=True)
print(f"Sheets: {wb.sheetnames}")
for sheet_name in wb.sheetnames[:1]:
ws = wb[sheet_name]
all_rows = list(ws.iter_rows())
print(f"Sheet '{sheet_name}': {len(all_rows)} rows, {len(list(ws.iter_cols())) if all_rows else 0} cols")
except Exception as e:
print(f" Error: {e}")
# Build the prompt for one file and measure it
print("\n--- Testing SUM file text length ---")
from app.processors.xlsx_proc import XLSXProcessor
class MockJob:
def __init__(self, path, filename):
self.file_path = path
self.filename = filename
self.id = "test"
self.user_id = "test"
self.file_type = "xlsx"
class MockSettings:
GROQ_API_KEY = "test"
GROQ_MODEL = "test"
GROQ_VISION_MODEL = "test"
for r in rows[:1]:
job = MockJob(r[1], r[0])
proc = XLSXProcessor(job=job, settings=MockSettings())
text = proc.extract()
print(f"Full extracted text length: {len(text)} chars")
print(f"After 1500-char truncation: {min(len(text), 1500)} chars")
trunc = text[:1500]
# Build actual prompt
prompt = f"""You are a data analyst. Analyse the following spreadsheet data and return ONLY valid JSON.
No preamble, no markdown code blocks, just raw JSON.
Return this exact structure:
{{
"title": "spreadsheet title or filename",
"summary": "2-3 sentence summary of what this data contains",
"sheets": ["list of sheet names found"],
"column_descriptions": {{"column_name": "what it likely represents"}},
"key_insights": ["notable patterns, max/min values, trends noticed"],
"row_count": 0
}}
Spreadsheet data:
{trunc}
"""
print(f"Full prompt char length: {len(prompt)}")
print(f"Prompt first 300 chars:")
print(prompt[:300])
print(f"\nLast 300 chars of prompt:")
print(prompt[-300:])