Spaces:
Runtime error
Runtime error
| import os | |
| import csv | |
| import json | |
| import re | |
| from typing import List, Dict, Any | |
| from pypdf import PdfReader | |
| from uuid import uuid4 | |
def parse_pdf_text(file_path: str, max_pages: int = 5) -> str:
    """Return the text of the leading pages of a PDF.

    Pages are read in order until ``max_pages`` pages have been visited.
    Each page that yields text contributes that text followed by a newline;
    pages that yield nothing are skipped.

    Any exception raised while opening or reading the file is caught and
    reported through the return value as ``"Error reading PDF: <message>"``
    rather than being propagated.
    """
    try:
        pdf = PdfReader(file_path)
        # No attempt is made to locate MD&A or the financial statements;
        # whatever text the leading pages hold is returned as-is.
        collected = []
        for page_no, pdf_page in enumerate(pdf.pages):
            if page_no >= max_pages:
                break
            extracted = pdf_page.extract_text()
            if not extracted:
                continue
            collected.append(extracted + "\n")
        return "".join(collected)
    except Exception as exc:
        return f"Error reading PDF: {str(exc)}"
def _strip_code_fences(raw: str) -> str:
    """Return *raw* without surrounding whitespace or a wrapping Markdown code fence."""
    # Strip first: the opening-fence pattern is anchored at the start of the
    # string, so leading whitespace or a newline would otherwise hide the fence.
    cleaned = raw.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```\s*$", "", cleaned)
    return cleaned.strip()


def _task_from_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Build one task record from a model-produced JSON object.

    The record carries the same keys as the columns written by
    ``save_tasks_to_csv``. Keys missing from *item* fall back to defaults.
    """
    if not isinstance(item, dict):
        raise TypeError(
            f"expected a JSON object for each task, got {type(item).__name__}"
        )
    return {
        "task_id": "dynamic_filing_" + str(uuid4())[:8],
        "task_name": "Filing_QA_" + str(uuid4())[:4],
        "world_id": "world_filing",
        "domain": item.get("domain", "Finance"),
        "prompt": item.get("prompt", ""),
        "task_input_files": "",
        "expected_output": "message_in_console",
        "gold_response": item.get("gold_response", ""),
        "gold_response_type": "text",
        "rubric": item.get("rubric", ""),
    }


def generate_tasks_from_filing(api_key: str, file_path: str, num_tasks: int = 3) -> List[Dict[str, Any]]:
    """Use the Mistral API to generate QA tasks from the filing at *file_path*.

    Text is extracted from the first six pages of the PDF, truncated to 8000
    characters, and embedded in a prompt that asks for ``num_tasks`` tasks as
    a JSON array. The reply is unwrapped from any Markdown code fence, parsed,
    and each object is turned into a task record.

    Args:
        api_key: Passed through unchanged to ``_call_mistral``.
        file_path: Path of the PDF filing to read.
        num_tasks: Number of tasks requested in the prompt. The reply is not
            truncated or padded, so the returned list may have another length.

    Returns:
        A list of task dictionaries, one per object in the model's reply.

    Raises:
        ValueError: If the PDF cannot be read, if no text could be extracted
            from it, or if the API call or the parsing of its reply fails
            (the original exception is chained as ``__cause__``).
    """
    # Imported inside the function, as the original code does.
    # NOTE(review): presumably expects (api_key, prompt) -> str; confirm in server.mistral_client.
    from server.mistral_client import _call_mistral

    # 1. Extract text from the PDF.
    filing_text = parse_pdf_text(file_path, max_pages=6)
    # parse_pdf_text reports failures through its return value. Match its full
    # prefix so a filing whose text merely begins with "Error" is not rejected.
    if filing_text.startswith("Error reading PDF:"):
        raise ValueError(filing_text)
    # A PDF with no text layer (e.g. scanned images) yields nothing; prompting
    # the model with an empty excerpt would only invite made-up figures.
    if not filing_text.strip():
        raise ValueError(f"No extractable text found in the first pages of {file_path}")

    # Cap the excerpt so the prompt stays small (a character cap, not a token count).
    text_chunk = filing_text[:8000]

    # 2. Prompt Mistral.
    sys_prompt = f"""You are a financial analyst extracting quantitative data from SEC Filings to create QA tests for an AI agent.
Below is an excerpt from a corporate filing:
---
{text_chunk}
---
Generate exactly {num_tasks} quantitative QA tasks based ONLY on the numbers and data found in the text above.
Output a valid JSON array of objects. Do not include markdown formatting or code fences.
Each object must have these exact keys:
- "domain": "Finance" or "Operations"
- "prompt": The problem or question targeting a specific calculation (e.g. margins, growth, terminal value) using figures from the text. Format final request clearly (e.g. "Format as US$ million").
- "rubric": scoring criteria like "criteria: States the correct value; criteria: uses correct formula"
- "gold_response": the exact expected answer in one short sentence.
Output only the JSON array."""

    try:
        raw = _call_mistral(api_key, sys_prompt)
        data = json.loads(_strip_code_fences(raw))
        # A single object is accepted and treated as a one-element array.
        if not isinstance(data, list):
            data = [data]
        return [_task_from_item(item) for item in data]
    except Exception as e:
        # Callers receive one exception type; the cause is kept for debugging.
        raise ValueError(f"Failed to generate tasks via Mistral: {e}") from e
def save_tasks_to_csv(tasks: List[Dict[str, Any]], csv_path: str) -> None:
    """Append *tasks* to the CSV file at *csv_path*, creating it if needed.

    A header row is written first when the file does not exist yet or exists
    but is empty, so the result always starts with the column names.

    Args:
        tasks: Task dictionaries keyed by the column names listed below.
            Missing keys are written as empty cells.
        csv_path: Destination CSV file, opened in append mode as UTF-8.

    Raises:
        ValueError: If a task contains a key that is not one of the columns
            (``csv.DictWriter`` default behaviour).
        OSError: If the file cannot be opened or written.
    """
    fields = [
        "task_id", "task_name", "world_id", "domain", "prompt",
        "task_input_files", "expected_output", "gold_response",
        "gold_response_type", "rubric",
    ]
    # A zero-byte file (e.g. freshly touched) has no header either, so check
    # the size as well as the existence before opening in append mode.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        if needs_header:
            writer.writeheader()
        writer.writerows(tasks)