Spaces:

Xxa1
/

enterprise_qa

Runtime error

enterprise_qa / server /data_generator.py

krishthukral

Final cleanup deployment

acc61a7 3 months ago

3.93 kB

	import os
	import csv
	import json
	import re
	from typing import List, Dict, Any
	from pypdf import PdfReader
	from uuid import uuid4

	def parse_pdf_text(file_path: str, max_pages: int = 5) -> str:
	    """Return the text of the leading pages of a PDF, or an error string.

	    Pages are visited in document order and reading stops once ``max_pages``
	    pages have been seen. Every page that yields non-empty text contributes
	    that text followed by a newline; pages without extractable text are
	    skipped.

	    Any exception raised while opening or reading the file is caught and
	    reported as a string of the form ``"Error reading PDF: <message>"``
	    rather than being propagated.
	    """
	    try:
	        pdf = PdfReader(file_path)
	        # NOTE: the opening pages of a 10-Q/10-K are frequently the table of
	        # contents and risk factors, whereas MD&A or the financial statements
	        # would suit quantitative QA better. Taking whatever text comes first
	        # is a deliberate simplification for demonstration purposes.
	        collected = []
	        for page_no, pdf_page in enumerate(pdf.pages):
	            if page_no >= max_pages:
	                break
	            extracted = pdf_page.extract_text()
	            if extracted:
	                collected.append(extracted + "\n")
	        return "".join(collected)
	    except Exception as e:
	        return f"Error reading PDF: {str(e)}"

	def _strip_code_fences(raw: str) -> str:
	    """Return *raw* without a surrounding Markdown code fence, if present."""
	    body = raw.strip()
	    # Opening fence, optionally tagged as json.
	    body = re.sub(r"^```(?:json)?\s*", "", body)
	    # Closing fence; whitespace on either side is optional so that a reply
	    # ending in a bare fence (no trailing newline) is handled as well.
	    body = re.sub(r"\s*```\s*$", "", body)
	    return body


	def generate_tasks_from_filing(api_key: str, file_path: str, num_tasks: int = 3) -> List[Dict[str, Any]]:
	    """Generate QA task records from a PDF filing via the Mistral client.

	    The first pages of the PDF are converted to text, truncated, embedded in
	    a prompt asking for ``num_tasks`` quantitative QA tasks as a JSON array,
	    and the reply is parsed into task dictionaries.

	    Args:
	        api_key: Passed through to ``_call_mistral`` unchanged.
	        file_path: Path of the PDF filing to read.
	        num_tasks: Number of tasks requested in the prompt. The reply is not
	            trimmed or padded to this count.

	    Returns:
	        One dict per item in the reply, with the keys ``task_id``,
	        ``task_name``, ``world_id``, ``domain``, ``prompt``,
	        ``task_input_files``, ``expected_output``, ``gold_response``,
	        ``gold_response_type`` and ``rubric``. A reply that is a single JSON
	        object is treated as a one-element list.

	    Raises:
	        ValueError: If the PDF cannot be read, if it yields no text, or if
	            calling the model or parsing its reply fails for any reason.
	    """
	    from server.mistral_client import _call_mistral

	    # 1. Extract text from the PDF
	    filing_text = parse_pdf_text(file_path, max_pages=6)

	    # parse_pdf_text reports failures as a string with this exact prefix.
	    # Matching the full prefix avoids rejecting a filing whose own text
	    # merely happens to begin with the word "Error".
	    if filing_text.startswith("Error reading PDF:"):
	        raise ValueError(filing_text)

	    # With no text at all the model would have nothing to base tasks on.
	    if not filing_text.strip():
	        raise ValueError(f"No extractable text found in PDF: {file_path}")

	    # Keep the prompt bounded: only the first 8000 characters are sent.
	    text_chunk = filing_text[:8000]

	    # 2. Prompt Mistral
	    sys_prompt = f"""You are a financial analyst extracting quantitative data from SEC Filings to create QA tests for an AI agent.
	Below is an excerpt from a corporate filing:
	---
	{text_chunk}
	---

	Generate exactly {num_tasks} quantitative QA tasks based ONLY on the numbers and data found in the text above.
	Output a valid JSON array of objects. Do not include markdown formatting or code fences.
	Each object must have these exact keys:
	- "domain": "Finance" or "Operations"
	- "prompt": The problem or question targeting a specific calculation (e.g. margins, growth, terminal value) using figures from the text. Format final request clearly (e.g. "Format as US$ million").
	- "rubric": scoring criteria like "criteria: States the correct value; criteria: uses correct formula"
	- "gold_response": the exact expected answer in one short sentence.

	Output only the JSON array."""

	    try:
	        # presumably _call_mistral returns the model's reply as a string;
	        # verify against server.mistral_client
	        raw = _call_mistral(api_key, sys_prompt)
	        # The prompt forbids code fences, but strip them defensively.
	        data = json.loads(_strip_code_fences(raw))

	        if not isinstance(data, list):
	            data = [data]

	        tasks = []
	        for item in data:
	            task = {
	                "task_id": "dynamic_filing_" + str(uuid4())[:8],
	                "task_name": "Filing_QA_" + str(uuid4())[:4],
	                "world_id": "world_filing",
	                "domain": item.get("domain", "Finance"),
	                "prompt": item.get("prompt", ""),
	                "task_input_files": "",
	                "expected_output": "message_in_console",
	                "gold_response": item.get("gold_response", ""),
	                "gold_response_type": "text",
	                "rubric": item.get("rubric", ""),
	            }
	            tasks.append(task)
	        return tasks
	    except Exception as e:
	        # Single failure type for callers; the original cause stays attached.
	        raise ValueError(f"Failed to generate tasks via Mistral: {e}") from e

	def save_tasks_to_csv(tasks: List[Dict[str, Any]], csv_path: str) -> None:
	    """Append task records to a CSV file, writing the header when needed.

	    The file is opened in append mode as UTF-8. A header row is written
	    first when the file does not exist yet or exists but is empty, so the
	    result always starts with the column names.

	    Args:
	        tasks: Task dictionaries to append. Keys missing from a task are
	            written as empty cells; a key outside the known columns makes
	            ``csv.DictWriter`` raise ``ValueError``.
	        csv_path: Path of the CSV file to append to.
	    """
	    fields = [
	        "task_id", "task_name", "world_id", "domain", "prompt",
	        "task_input_files", "expected_output", "gold_response",
	        "gold_response_type", "rubric",
	    ]

	    # An existing zero-byte file still needs a header, so checking for the
	    # file's existence alone is not enough.
	    needs_header = (
	        not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
	    )

	    # newline='' lets the csv module control line endings itself.
	    with open(csv_path, mode='a', newline='', encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=fields)
	        if needs_header:
	            writer.writeheader()
	        writer.writerows(tasks)