import asyncio
import logging
import os
import tempfile
from typing import List

import fitz  # PyMuPDF
import requests
from agents import (
    Agent,
    Runner,
    AsyncOpenAI,
    OpenAIChatCompletionsModel,
    set_tracing_disabled,
    SQLiteSession,
    ModelSettings
)
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel, Field

# ---------------- Setup ----------------
# Load variables from a local .env file (if present) before the key is read below.
load_dotenv()
# Switch off the agents SDK's tracing for this process.
set_tracing_disabled(True)

# Gemini API key taken from the environment; this is None when GEMINI_API_KEY is unset.
API_KEY = os.getenv("GEMINI_API_KEY")

# OpenAI-style async client pointed at Google's OpenAI-compatible Gemini endpoint.
client_provider = AsyncOpenAI(
    api_key=API_KEY,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

# Chat-completions model wrapper that talks to "gemini-2.0-flash" through the client above.
Model = OpenAIChatCompletionsModel(
    model="gemini-2.0-flash",
    openai_client=client_provider,
)

# FastAPI application object; the endpoint in this file is registered on it.
app = FastAPI(title="Student Data Extractor API")

# ---------------- Schemas ----------------
# One student row extracted from the report.
# Every field is a str defaulting to "", so a row with missing columns still
# validates; keeping roll_no and mobile as text also preserves leading zeros.
# NOTE: documented with comments rather than a class docstring, because pydantic
# copies a model docstring into the JSON schema it generates for the model.
class StudentRecord(BaseModel):
    name: str = Field(default="", description="Student's name")
    roll_no: str = Field(default="", description="Roll number")
    class_name: str = Field(default="", description="Class level")
    section: str = Field(default="", description="Section letter")
    mobile: str = Field(default="", description="Mobile number")

# Top-level extraction payload: the list of students found in the PDF text.
# Used as the agent's output_type; `students` defaults to a fresh empty list.
class ExtractResponse(BaseModel):
    students: List[StudentRecord] = Field(default_factory=list)

# Request body accepted by POST /extract-student.
class PDFRequest(BaseModel):
    # URL of the PDF to download. Declared as a plain str, so the model itself
    # performs no URL-format validation.
    pdfUrl: str

# ---------------- Agent Definition ----------------
# Agent that turns the raw text of a student report into an ExtractResponse.
student_agent = Agent(
    name="StudentPDFExtractor",
    model=Model,
    # Prompt: shows the expected report layout with one sample row, tells the
    # model which header lines to skip, and spells out the target JSON shape.
    instructions="""
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.

The PDF text typically includes:
Student Data Report - hyderabad sspo
Generated on: 10/24/2025
Name Roll No. Class Section Mobile
John Doe 05738999 12 A 09338488484848388

Ignore headers like 'Student Data Report' and 'Generated on:'.
Return all students in JSON with this schema:
{
  "students": [
    {
      "name": "string",
      "roll_no": "string",
      "class_name": "string",
      "section": "string",
      "mobile": "string"
    }
  ]
}
""",
    # Structured type the agent's final output is expected to conform to.
    output_type=ExtractResponse,
    # Sampling settings: low temperature and a restricted top_p.
    model_settings=ModelSettings(temperature=0.2, top_p=0.85)
)

# Module-level Runner instance; extract_from_text awaits runner.run(...) on it.
runner = Runner()

# ---------------- Helper Functions ----------------
def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
    """Download a PDF over HTTP and return the text of all of its pages.

    The document is parsed directly from the downloaded bytes, so nothing is
    written to disk and there is no temporary file to leak when parsing fails.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host.

    Returns:
        The plain text of every page, in page order, joined with newlines.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        Exception: PyMuPDF could not open the payload as a PDF.
    """
    # NOTE(review): pdf_url is caller-supplied and fetched as-is; consider
    # restricting it to an allow-list of hosts if this service is exposed.
    print(f"📥 Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    print("✅ PDF text extracted successfully.")
    return text


async def extract_from_text(text: str) -> dict:
    """Run the extraction agent over raw PDF text and return plain-dict data.

    Args:
        text: Text previously extracted from the PDF.

    Returns:
        The agent's structured output dumped to a dict (shape:
        ``{"students": [...]}``). ``{"students": []}`` is returned when the
        text is blank or the run produced no structured output.
    """
    print(f"📄 Extracting from {len(text)} characters...")

    # A blank document (e.g. an image-only scan) has nothing to extract; skip
    # the model call so it cannot echo the sample row from its instructions.
    if not text.strip():
        return {"students": []}

    # No session is attached: every extraction is independent, and the former
    # SQLiteSession("student_trace.db") passed the file name as the session id,
    # which only built a throwaway in-memory store on each call.
    resp = await runner.run(student_agent, text)

    # Accept either result attribute name, in the order the code always used,
    # but tolerate a missing/None value instead of failing on .model_dump().
    for attr in ("output", "final_output"):
        payload = getattr(resp, attr, None)
        if payload is None:
            continue
        if hasattr(payload, "model_dump"):
            return payload.model_dump()
        if isinstance(payload, dict):
            return payload

    return {"students": []}

# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """Download the PDF at ``req.pdfUrl`` and return its structured student data.

    Args:
        req: Request body carrying the PDF URL.

    Returns:
        On success: ``{"success": True, "pdfUrl", "structured",
        "raw_text_preview"}``, where the preview is the first 800 characters
        of the extracted text. On any failure: ``{"success": False,
        "error": <message>}``; the failure is also logged with its traceback.
    """
    try:
        # The download and PDF parsing are blocking calls; run them in a worker
        # thread so this coroutine does not stall the event loop meanwhile.
        text = await asyncio.to_thread(download_and_extract_text, req.pdfUrl)
        structured = await extract_from_text(text)
    except Exception as e:
        # Boundary handler: keep the existing error-shaped response for
        # clients, but record the traceback instead of discarding it.
        logging.getLogger(__name__).exception(
            "Student extraction failed for %s", req.pdfUrl
        )
        return {"success": False, "error": str(e)}

    return {
        "success": True,
        "pdfUrl": req.pdfUrl,
        "structured": structured,
        "raw_text_preview": text[:800]  # trimmed preview
    }