enterprise_qa / server /data_generator.py
krishthukral
Final cleanup deployment
acc61a7
import os
import csv
import json
import re
from typing import List, Dict, Any
from pypdf import PdfReader
from uuid import uuid4
def parse_pdf_text(file_path: str, max_pages: int = 5) -> str:
    """Return the text of the leading pages of a PDF.

    Pages are read in order and reading stops once ``max_pages`` pages have
    been visited. Pages that yield no text are skipped; the text of every
    other page is followed by a newline.

    Args:
        file_path: Location of the PDF to read.
        max_pages: Upper bound on the number of pages to visit.

    Returns:
        The concatenated page text, or a message beginning with
        ``"Error reading PDF: "`` if anything goes wrong while reading.
    """
    try:
        pdf = PdfReader(file_path)
        # No attempt is made to locate a particular section (MD&A, financial
        # statements, ...); the document is simply read from the first page.
        pieces = []
        for page_index, page in enumerate(pdf.pages):
            if page_index >= max_pages:
                break
            extracted = page.extract_text()
            if not extracted:
                continue
            pieces.append(extracted + "\n")
        return "".join(pieces)
    except Exception as err:
        # Failures are reported through the return value rather than raised.
        return "Error reading PDF: " + str(err)
def generate_tasks_from_filing(
    api_key: str, file_path: str, num_tasks: int = 3
) -> List[Dict[str, Any]]:
    """Use the Mistral API to generate QA tasks based on the provided filing.

    Text is extracted from the first pages of the PDF, truncated, embedded in
    a prompt, and the model's JSON reply is converted into task records.

    Args:
        api_key: Key passed through to ``_call_mistral``.
        file_path: Path of the filing PDF.
        num_tasks: Number of tasks requested in the prompt. The reply is not
            trimmed or padded, so the returned list may differ in length.

    Returns:
        One dict per generated task, with the keys ``task_id``,
        ``task_name``, ``world_id``, ``domain``, ``prompt``,
        ``task_input_files``, ``expected_output``, ``gold_response``,
        ``gold_response_type`` and ``rubric``.

    Raises:
        ValueError: If the PDF cannot be read, contains no extractable text,
            or the model call / reply parsing fails.
    """
    # Imported at call time rather than at module import time.
    from server.mistral_client import _call_mistral

    # 1. Extract text from the PDF.
    filing_text = parse_pdf_text(file_path, max_pages=6)
    # parse_pdf_text signals failure with this exact prefix. Matching the
    # whole prefix avoids rejecting a filing whose text merely begins with
    # the word "Error".
    if filing_text.startswith("Error reading PDF:"):
        raise ValueError(filing_text)
    if not filing_text.strip():
        # Without any text the model has no data to ground the tasks in.
        raise ValueError(f"No extractable text found in filing: {file_path}")

    # Keep only the first 8000 characters so the prompt stays small.
    text_chunk = filing_text[:8000]

    # 2. Prompt Mistral.
    sys_prompt = f"""You are a financial analyst extracting quantitative data from SEC Filings to create QA tests for an AI agent.
Below is an excerpt from a corporate filing:
---
{text_chunk}
---
Generate exactly {num_tasks} quantitative QA tasks based ONLY on the numbers and data found in the text above.
Output a valid JSON array of objects. Do not include markdown formatting or code fences.
Each object must have these exact keys:
- "domain": "Finance" or "Operations"
- "prompt": The problem or question targeting a specific calculation (e.g. margins, growth, terminal value) using figures from the text. Format final request clearly (e.g. "Format as US$ million").
- "rubric": scoring criteria like "criteria: States the correct value; criteria: uses correct formula"
- "gold_response": the exact expected answer in one short sentence.
Output only the JSON array."""

    try:
        raw = _call_mistral(api_key, sys_prompt)
        data = json.loads(_strip_code_fences(raw))
        # A single object is accepted and treated as a one-element array.
        if not isinstance(data, list):
            data = [data]
        return [_build_filing_task(item) for item in data]
    except Exception as e:
        # Preserve the original exception as the cause for debugging.
        raise ValueError(f"Failed to generate tasks via Mistral: {e}") from e


def _strip_code_fences(raw: str) -> str:
    """Remove a surrounding Markdown code fence from a model reply."""
    # Strip first: the "^" anchor below only matches when the fence is the
    # very first thing in the string, and replies often start with whitespace.
    cleaned = raw.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```\s*$", "", cleaned)
    return cleaned.strip()


def _build_filing_task(item: Any) -> Dict[str, Any]:
    """Convert one parsed JSON object from the model into a task record."""
    if not isinstance(item, dict):
        raise TypeError(
            f"Expected a JSON object for each task, got {type(item).__name__}"
        )
    return {
        "task_id": "dynamic_filing_" + str(uuid4())[:8],
        "task_name": "Filing_QA_" + str(uuid4())[:4],
        "world_id": "world_filing",
        "domain": item.get("domain", "Finance"),
        "prompt": item.get("prompt", ""),
        "task_input_files": "",
        "expected_output": "message_in_console",
        "gold_response": item.get("gold_response", ""),
        "gold_response_type": "text",
        "rubric": item.get("rubric", ""),
    }
def save_tasks_to_csv(tasks: List[Dict[str, Any]], csv_path: str) -> None:
    """Append tasks to the CSV at ``csv_path``, creating it if necessary.

    The header row is written when the file does not exist yet or exists but
    is empty, so the resulting file always starts with a header.

    Args:
        tasks: Task records to append. Keys missing from a record are written
            as empty cells; a key outside the known columns makes
            ``csv.DictWriter`` raise ``ValueError``.
        csv_path: Path of the CSV file to append to.
    """
    fields = [
        "task_id", "task_name", "world_id", "domain", "prompt",
        "task_input_files", "expected_output", "gold_response",
        "gold_response_type", "rubric",
    ]
    # A zero-byte file has no header yet, so treat it like a missing file.
    needs_header = not (
        os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0
    )
    with open(csv_path, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        if needs_header:
            writer.writeheader()
        writer.writerows(tasks)