import csv
import json
import os
import re
from typing import Any, Dict, List
from uuid import uuid4

from pypdf import PdfReader

# Prefix of the in-band error string returned by parse_pdf_text().
_PDF_ERROR_PREFIX = "Error reading PDF: "

# Number of leading PDF pages read when generating tasks.
_MAX_FILING_PAGES = 6

# Upper bound on the excerpt embedded in the prompt. This is a *character*
# count (a plain string slice), used as a rough stand-in for a token budget.
_MAX_PROMPT_CHARS = 8000

# Column order of the task CSV; also the set of keys each task dict carries.
_CSV_FIELDS = [
    "task_id",
    "task_name",
    "world_id",
    "domain",
    "prompt",
    "task_input_files",
    "expected_output",
    "gold_response",
    "gold_response_type",
    "rubric",
]

# Filled in with str.format(); the template itself contains no other braces,
# and the filing text is passed as an argument, so braces in it are safe.
_PROMPT_TEMPLATE = """You are a financial analyst extracting quantitative data from SEC Filings to create QA tests for an AI agent.
Below is an excerpt from a corporate filing:

---
{text_chunk}
---

Generate exactly {num_tasks} quantitative QA tasks based ONLY on the numbers and data found in the text above.
Output a valid JSON array of objects. Do not include markdown formatting or code fences.
Each object must have these exact keys:
- "domain": "Finance" or "Operations"
- "prompt": The problem or question targeting a specific calculation (e.g. margins, growth, terminal value) using figures from the text. Format final request clearly (e.g. "Format as US$ million").
- "rubric": scoring criteria like "criteria: States the correct value; criteria: uses correct formula"
- "gold_response": the exact expected answer in one short sentence.

Output only the JSON array."""


def parse_pdf_text(file_path: str, max_pages: int = 5) -> str:
    """Extract text from the first few pages of a PDF.

    Args:
        file_path: Path of the PDF to read.
        max_pages: Maximum number of leading pages to read.

    Returns:
        The text of each read page that yielded any text, each followed by a
        newline. If anything goes wrong, no exception is raised; instead a
        string of the form ``"Error reading PDF: <message>"`` is returned.
    """
    # NOTE: the first pages of a 10-Q/10-K often hold the table of contents
    # and risk factors, whereas MD&A or the financial statements would be
    # better for quantitative QA. Taking the leading pages is a
    # demonstration-level simplification.
    try:
        reader = PdfReader(file_path)
        page_texts = []
        for index, page in enumerate(reader.pages):
            if index >= max_pages:
                break
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + "\n")
        return "".join(page_texts)
    except Exception as e:
        # Deliberate best-effort contract: report failures in-band.
        return f"{_PDF_ERROR_PREFIX}{e}"


def _parse_task_items(raw: str) -> List[Dict[str, Any]]:
    """Decode the model reply into a list of JSON objects.

    Tolerates a reply wrapped in a Markdown code fence and a reply that is a
    single object rather than an array.
    """
    # Strip surrounding whitespace *before* removing the fence: the opening
    # pattern is anchored at the start of the string and would otherwise miss
    # a fence preceded by a newline or spaces.
    cleaned = raw.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned)

    data = json.loads(cleaned)
    if not isinstance(data, list):
        data = [data]

    for item in data:
        if not isinstance(item, dict):
            raise TypeError(
                f"expected a JSON object for each task, got {type(item).__name__}"
            )
    return data


def generate_tasks_from_filing(api_key: str, file_path: str, num_tasks: int = 3) -> List[Dict[str, Any]]:
    """Use the Mistral API to generate QA tasks based on the provided filing.

    Args:
        api_key: Key forwarded to ``_call_mistral``.
        file_path: Path of the filing PDF.
        num_tasks: Number of tasks requested in the prompt. The reply is not
            truncated or padded to this count.

    Returns:
        One dict per object in the model's reply, with the keys listed in
        ``_CSV_FIELDS``. ``domain`` defaults to ``"Finance"`` and ``prompt``,
        ``gold_response`` and ``rubric`` default to ``""`` when missing.

    Raises:
        ValueError: If the PDF cannot be read, if no text could be extracted
            from it, or if the model call or the parsing of its reply fails.
    """
    # Imported inside the function, as in the original code.
    # NOTE(review): presumably to avoid an import cycle or a hard dependency
    # at module import time - confirm.
    from server.mistral_client import _call_mistral

    # 1. Extract text from the PDF.
    filing_text = parse_pdf_text(file_path, max_pages=_MAX_FILING_PAGES)
    # Match the exact sentinel prefix, not just "Error": a filing whose text
    # happens to begin with the word "Error" is not a read failure.
    if filing_text.startswith(_PDF_ERROR_PREFIX):
        raise ValueError(filing_text)
    if not filing_text.strip():
        # Without any excerpt the model could only invent figures.
        raise ValueError(
            f"No extractable text found in the first {_MAX_FILING_PAGES} pages of the filing."
        )

    # Truncate the text so the prompt stays within a safe size.
    text_chunk = filing_text[:_MAX_PROMPT_CHARS]

    # 2. Prompt Mistral.
    sys_prompt = _PROMPT_TEMPLATE.format(text_chunk=text_chunk, num_tasks=num_tasks)

    try:
        raw = _call_mistral(api_key, sys_prompt)
        items = _parse_task_items(raw)

        tasks = []
        for item in items:
            task = {
                "task_id": "dynamic_filing_" + str(uuid4())[:8],
                "task_name": "Filing_QA_" + str(uuid4())[:4],
                "world_id": "world_filing",
                "domain": item.get("domain", "Finance"),
                "prompt": item.get("prompt", ""),
                "task_input_files": "",
                "expected_output": "message_in_console",
                "gold_response": item.get("gold_response", ""),
                "gold_response_type": "text",
                "rubric": item.get("rubric", ""),
            }
            tasks.append(task)
        return tasks
    except Exception as e:
        # Single error type for callers; chain the cause for debugging.
        raise ValueError(f"Failed to generate tasks via Mistral: {e}") from e


def save_tasks_to_csv(tasks: List[Dict[str, Any]], csv_path: str) -> None:
    """Append tasks to the CSV at ``csv_path``, creating it if needed.

    The header row is written when the file does not exist yet or exists but
    is empty, so the resulting file always starts with a header.

    Args:
        tasks: Task dicts keyed by the names in ``_CSV_FIELDS``. Missing keys
            are written as empty cells; unknown keys make ``csv.DictWriter``
            raise ``ValueError``.
        csv_path: Destination CSV file.
    """
    # A zero-byte file (e.g. created by `touch`) still needs its header.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        if needs_header:
            writer.writeheader()
        writer.writerows(tasks)