Spaces:

Tahasaif3
/

DataExtract

Sleeping

App Files Files Community

Tahasaif3 committed on Oct 25, 2025

Commit

8d736da

1 Parent(s): 8c03e3b

'code'

Browse files

Files changed (2) hide show

main.py +136 -0
requirements.txt +7 -0

main.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import fitz  # PyMuPDF
+import tempfile
+import requests
+from typing import List
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+from agents import (
+    Agent,
+    Runner,
+    AsyncOpenAI,
+    OpenAIChatCompletionsModel,
+    set_tracing_disabled,
+    SQLiteSession,
+    ModelSettings
+)
+from dotenv import load_dotenv
+# ---------------- Setup ----------------
+load_dotenv()  # read a local .env file (if any) into the environment before GEMINI_API_KEY is looked up below
+set_tracing_disabled(True)  # presumably switches off the agents SDK's trace export — confirm against the SDK docs
+API_KEY = os.getenv("GEMINI_API_KEY")  # None when the variable is unset; nothing validates it here
+client_provider = AsyncOpenAI(  # OpenAI-style async client pointed at a non-default base URL
+    api_key=API_KEY,
+    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",  # Google Generative Language API, OpenAI-compatible path
+)
+Model = OpenAIChatCompletionsModel(  # chat-completions model wrapper; passed as model= to the Agent defined further down
+    model="gemini-2.0-flash",
+    openai_client=client_provider,
+)
+app = FastAPI(title="Student Data Extractor API")  # application object the /extract-student route is registered on
+# ---------------- Schemas ----------------
+class StudentRecord(BaseModel):  # one student row; every field is a string that defaults to "" when absent
+    name: str = Field(default="", description="Student's name")
+    roll_no: str = Field(default="", description="Roll number")  # str, so a value like "05738999" keeps its leading zero
+    class_name: str = Field(default="", description="Class level")  # "class" itself is a reserved word in Python
+    section: str = Field(default="", description="Section letter")
+    mobile: str = Field(default="", description="Mobile number")  # str, so long digit strings are stored verbatim
+class ExtractResponse(BaseModel):  # top-level result shape; used as output_type of the Agent defined below
+    students: List[StudentRecord] = Field(default_factory=list)  # a fresh empty list per instance when none are supplied
+class PDFRequest(BaseModel):  # JSON request body of POST /extract-student
+    pdfUrl: str  # required; typed as a plain string, so the URL's format and host are not validated here
+# ---------------- Agent Definition ----------------
+student_agent = Agent(  # the single agent that turns raw PDF text into an ExtractResponse
+    name="StudentPDFExtractor",
+    model=Model,  # the Gemini-backed chat-completions model built in the setup section
+    instructions="""
+You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
+The PDF text typically includes:
+Student Data Report - hyderabad sspo
+Generated on: 10/24/2025
+Name Roll No. Class Section Mobile
+John Doe 05738999 12 A 09338488484848388
+Ignore headers like 'Student Data Report' and 'Generated on:'.
+Return all students in JSON with this schema:
+{
+  "students": [
+    {
+      "name": "string",
+      "roll_no": "string",
+      "class_name": "string",
+      "section": "string",
+      "mobile": "string"
+    }
+  ]
+}
+""",  # the instructions string is runtime text handed to the Agent, not a comment; its JSON sketch mirrors ExtractResponse
+    output_type=ExtractResponse,  # requests output matching this schema — TODO confirm how the Gemini endpoint honours it
+    model_settings=ModelSettings(temperature=0.2, top_p=0.85)  # low temperature, presumably to keep extraction output stable
+)
+runner = Runner()  # module-level Runner instance; its run() is awaited in extract_from_text
+# ---------------- Helper Functions ----------------
+def download_and_extract_text(pdf_url: str) -> str:
+    """Downloads a PDF from Cloudinary and extracts text"""
+    print(f"📥 Downloading PDF from: {pdf_url}")
+    response = requests.get(pdf_url)
+    response.raise_for_status()
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+        tmp.write(response.content)
+        tmp_path = tmp.name
+    doc = fitz.open(tmp_path)
+    text = "\n".join(page.get_text("text") for page in doc)
+    doc.close()
+    os.remove(tmp_path)
+    print("✅ PDF text extracted successfully.")
+    return text
+async def extract_from_text(text: str) -> dict:
+    """Runs the agent to extract structured data"""
+    print(f"📄 Extracting from {len(text)} characters...")
+    resp = await runner.run(
+        student_agent,
+        text,  # ✅ plain text only
+        session=SQLiteSession("student_trace.db")
+    )
+    if hasattr(resp, "output"):
+        return resp.output.model_dump()
+    elif hasattr(resp, "final_output"):
+        return resp.final_output.model_dump()
+    return {"students": []}
+# ---------------- FastAPI Endpoint ----------------
+@app.post("/extract-student")
+async def extract_student(req: PDFRequest):
+    """
+    Accepts a Cloudinary PDF URL,
+    downloads it, extracts text, and returns structured student data.
+    """
+    try:
+        text = download_and_extract_text(req.pdfUrl)
+        structured = await extract_from_text(text)
+        return {
+            "success": True,
+            "pdfUrl": req.pdfUrl,
+            "structured": structured,
+            "raw_text_preview": text[:800]  # trimmed preview
+        }
+    except Exception as e:
+        return {"success": False, "error": str(e)}

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+python-dotenv
+uvicorn
+pydantic
+openai-agents
+PyMuPDF
+requests