# Spaces: Xxa1 / enterprise_qa
# Runtime error
# File size: 3,930 Bytes
# acc61a7

import os
import csv
import json
import re
from typing import List, Dict, Any
from pypdf import PdfReader
from uuid import uuid4

def parse_pdf_text(file_path: str, max_pages: int = 5) -> str:
    """Return the text of the leading pages of a PDF.

    Pages are visited in document order and reading stops once ``max_pages``
    pages have been visited. Every page that yields non-empty text contributes
    that text followed by a newline; pages with no text are skipped.

    Args:
        file_path: Location of the PDF handed to ``PdfReader``.
        max_pages: Upper bound on the number of pages inspected.

    Returns:
        The concatenated page text. Failures are NOT raised: any exception
        is converted into a string beginning with ``"Error reading PDF: "``,
        so callers must inspect the result to detect errors.
    """
    try:
        pdf = PdfReader(file_path)
        collected = []
        # Only the leading pages are sampled; no attempt is made to locate a
        # particular section of the document.
        for index, page in enumerate(pdf.pages):
            if index >= max_pages:
                break
            extracted = page.extract_text()
            if not extracted:
                continue
            collected.append(extracted + "\n")
        return "".join(collected)
    except Exception as err:
        return f"Error reading PDF: {str(err)}"

def generate_tasks_from_filing(api_key: str, file_path: str, num_tasks: int = 3) -> List[Dict[str, Any]]:
    """Generate quantitative QA tasks from a PDF filing via the Mistral API.

    The first pages of the PDF are extracted with ``parse_pdf_text``, the
    leading 8000 characters are embedded in a prompt, and the model's reply is
    parsed as JSON. Each returned JSON object becomes one task dict whose keys
    match the columns written by ``save_tasks_to_csv``.

    Args:
        api_key: Credential forwarded unchanged to ``_call_mistral``.
        file_path: Path of the PDF filing to read.
        num_tasks: Number of tasks requested in the prompt. The reply is not
            truncated or padded, so the result may hold a different count.

    Returns:
        A list of task dicts. ``task_id`` and ``task_name`` carry random
        suffixes; ``domain``, ``prompt``, ``gold_response`` and ``rubric``
        come from the model output (with defaults when a key is absent).

    Raises:
        ValueError: If the PDF cannot be read, contains no extractable text,
            or the Mistral call / JSON parsing / task construction fails.
    """
    # Kept function-local, as in the original code.
    # NOTE(review): presumably returns the model's reply as a string - confirm.
    from server.mistral_client import _call_mistral

    # 1. Extract text from the PDF
    filing_text = parse_pdf_text(file_path, max_pages=6)

    # parse_pdf_text reports failures as a string with this exact prefix.
    # Matching the whole prefix (not just "Error") avoids rejecting a filing
    # whose genuine text happens to start with the word "Error".
    if filing_text.startswith("Error reading PDF: "):
        raise ValueError(filing_text)

    # A PDF without a text layer yields no text; prompting the model with an
    # empty excerpt could only produce invented figures.
    if not filing_text.strip():
        raise ValueError(f"No extractable text found in filing: {file_path}")

    # Cap the excerpt so the prompt stays at a bounded size
    text_chunk = filing_text[:8000]

    # 2. Prompt Mistral
    sys_prompt = f"""You are a financial analyst extracting quantitative data from SEC Filings to create QA tests for an AI agent.
Below is an excerpt from a corporate filing:
---
{text_chunk}
---

Generate exactly {num_tasks} quantitative QA tasks based ONLY on the numbers and data found in the text above.
Output a valid JSON array of objects. Do not include markdown formatting or code fences.
Each object must have these exact keys:
- "domain": "Finance" or "Operations"
- "prompt": The problem or question targeting a specific calculation (e.g. margins, growth, terminal value) using figures from the text. Format final request clearly (e.g. "Format as US$ million").
- "rubric": scoring criteria like "criteria: States the correct value; criteria: uses correct formula"
- "gold_response": the exact expected answer in one short sentence.

Output only the JSON array."""

    try:
        raw = _call_mistral(api_key, sys_prompt)

        # Strip surrounding whitespace FIRST: the fence patterns are anchored
        # with ^ and $, so a leading newline/space before ``` would otherwise
        # leave the fence in place and break json.loads.
        cleaned = raw.strip()
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
        cleaned = re.sub(r"\s*```\s*$", "", cleaned)
        data = json.loads(cleaned)

        # Tolerate a single object where an array was requested.
        if not isinstance(data, list):
            data = [data]

        tasks = []
        for item in data:
            if not isinstance(item, dict):
                # Surface a readable cause instead of an AttributeError on .get
                raise TypeError(
                    f"expected a JSON object per task, got {type(item).__name__}"
                )
            task = {
                "task_id": "dynamic_filing_" + str(uuid4())[:8],
                "task_name": "Filing_QA_" + str(uuid4())[:4],
                "world_id": "world_filing",
                "domain": item.get("domain", "Finance"),
                "prompt": item.get("prompt", ""),
                "task_input_files": "",
                "expected_output": "message_in_console",
                "gold_response": item.get("gold_response", ""),
                "gold_response_type": "text",
                "rubric": item.get("rubric", "")
            }
            tasks.append(task)
        return tasks
    except Exception as e:
        # Callers see a single exception type; the original cause stays
        # attached for debugging.
        raise ValueError(f"Failed to generate tasks via Mistral: {e}") from e

def save_tasks_to_csv(tasks: List[Dict[str, Any]], csv_path: str):
    """Append tasks as rows to the CSV file at ``csv_path``.

    The file is opened in append mode (and created if absent). A header row
    is written first whenever the file does not exist yet or exists but is
    empty, so the resulting file always starts with the column names.

    Args:
        tasks: Task dicts to write. Keys missing from a task are written as
            empty cells.
        csv_path: Destination CSV path.

    Raises:
        ValueError: If a task contains a key that is not one of the columns
            (default ``csv.DictWriter`` behaviour).
        OSError: If the file cannot be opened or written.
    """
    fields = [
        "task_id", "task_name", "world_id", "domain", "prompt",
        "task_input_files", "expected_output", "gold_response",
        "gold_response_type", "rubric"
    ]

    # Checking existence alone is not enough: a pre-created zero-byte file
    # would receive data rows without a header, and a later reader would
    # mistake the first task for the column names.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, mode='a', newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        if needs_header:
            writer.writeheader()
        writer.writerows(tasks)