Spaces:

dhrumilparikh
/

Multimodel_Rag

Sleeping

Dhrumil Parikh

deploy GeminiRAG

cdc55f4 28 days ago

2.52 kB

	"""Dev utility — debug XLSX/CSV file processing step by step."""
	import sys
	sys.path.insert(0, r'C:\Users\Dhrumil.parikh\OneDrive - Taazaa Tech Pvt Ltd\Desktop\playbook_final\geminirag')

	from app.models.db import get_engine
	from sqlalchemy import text

	# Get one of the problem XLSX files
	engine = get_engine()
	with engine.connect() as conn:
	rows = conn.execute(text(
	"SELECT filename, file_path FROM jobs WHERE filename IN ('SUM-2025-3241.xlsx', 'INV-2025-4291-GT.xlsx', 'CTR-2026-3042.xlsx') LIMIT 3"
	)).fetchall()

	for r in rows:
	print(f"\n=== {r[0]} ===")
	try:
	import openpyxl
	wb = openpyxl.load_workbook(r[1], read_only=True, data_only=True)
	print(f"Sheets: {wb.sheetnames}")
	for sheet_name in wb.sheetnames[:1]:
	ws = wb[sheet_name]
	all_rows = list(ws.iter_rows())
	print(f"Sheet '{sheet_name}': {len(all_rows)} rows, {len(list(ws.iter_cols())) if all_rows else 0} cols")
	except Exception as e:
	print(f" Error: {e}")

	# Build the prompt for one file and measure it
	print("\n--- Testing SUM file text length ---")
	from app.processors.xlsx_proc import XLSXProcessor

	class MockJob:
	def __init__(self, path, filename):
	self.file_path = path
	self.filename = filename
	self.id = "test"
	self.user_id = "test"
	self.file_type = "xlsx"

	class MockSettings:
	GROQ_API_KEY = "test"
	GROQ_MODEL = "test"
	GROQ_VISION_MODEL = "test"

	for r in rows[:1]:
	job = MockJob(r[1], r[0])
	proc = XLSXProcessor(job=job, settings=MockSettings())
	text = proc.extract()
	print(f"Full extracted text length: {len(text)} chars")
	print(f"After 1500-char truncation: {min(len(text), 1500)} chars")
	trunc = text[:1500]

	# Build actual prompt
	prompt = f"""You are a data analyst. Analyse the following spreadsheet data and return ONLY valid JSON.
	No preamble, no markdown code blocks, just raw JSON.

	Return this exact structure:
	{{
	"title": "spreadsheet title or filename",
	"summary": "2-3 sentence summary of what this data contains",
	"sheets": ["list of sheet names found"],
	"column_descriptions": {{"column_name": "what it likely represents"}},
	"key_insights": ["notable patterns, max/min values, trends noticed"],
	"row_count": 0
	}}

	Spreadsheet data:
	{trunc}
	"""
	print(f"Full prompt char length: {len(prompt)}")
	print(f"Prompt first 300 chars:")
	print(prompt[:300])
	print(f"\nLast 300 chars of prompt:")
	print(prompt[-300:])