Spaces:

Hammad712
/

ingestion

Sleeping

App Files Files Community

Hammad712 committed on Dec 28, 2025

Commit

906c82a

1 Parent(s): f83819a

Added new files endpoints

Browse files

Files changed (3) hide show

app/main.py +5 -2
app/routes/files.py +54 -0
app/utils.py +40 -1

app/main.py CHANGED Viewed

@@ -9,9 +9,9 @@ from .services import model_client
 # Import your routers explicitly
 # Note: Ensure process.py and health.py are accessible.
-# If they are in a 'routes' folder, change to: from .routes import process, health
 try:
-    from . import process, health
 except ImportError:
     # Fallback if files are inside a 'routes' package
     from .routes import process, health
@@ -80,6 +80,9 @@ def startup_event():
 # --- Router Registration ---
 # Mount the Processing Router (e.g., /process/pdf/stream)
 app.include_router(process.router, prefix="/process", tags=["Process"])

 # Import your routers explicitly
 # Note: Ensure process.py and health.py are accessible.
+# If they are in a 'routes' folder, change to: from .routes import process, health, files
 try:
+    from . import process, health, files
 except ImportError:
     # Fallback if files are inside a 'routes' package
     from .routes import process, health
 # --- Router Registration ---
+# Mount the File Management Router (e.g., /files/upload)
+app.include_router(files.router, prefix="/files", tags=["File Management"]) # <--- Register here
 # Mount the Processing Router (e.g., /process/pdf/stream)
 app.include_router(process.router, prefix="/process", tags=["Process"])

app/routes/files.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from fastapi import APIRouter, HTTPException, Query
+from starlette.concurrency import run_in_threadpool
+from typing import Optional
+from ..utils import (
+    list_all_jobs,
+    get_job_by_filename,
+    get_signed_url
+)
+router = APIRouter()
+@router.get("/list")
+async def list_files():
+    """
+    Returns a list of all uploaded PDF filenames and their corresponding
+    Markdown report paths currently stored in the database.
+    """
+    result = await run_in_threadpool(list_all_jobs)
+    return {
+        "status": "success",
+        "count": len(result["pdf_files"]),
+        "uploaded_pdfs": result["pdf_files"],
+        "generated_reports": result["md_files"]
+    }
+@router.get("/download")
+async def download_file_by_name(filename: str = Query(..., description="The exact name of the uploaded PDF file")):
+    """
+    Takes a file name (e.g., 'document.pdf') as input and returns the
+    download URL for its generated Markdown report.
+    """
+    # 1. Find the job associated with this filename
+    job = await run_in_threadpool(get_job_by_filename, filename)
+    if not job:
+        raise HTTPException(status_code=404, detail=f"File '{filename}' not found in records.")
+    # 2. Get the path to the report
+    report_path = job.get("report")
+    if not report_path:
+        raise HTTPException(status_code=404, detail="Report path is missing for this file.")
+    # 3. Generate a secure download link
+    download_url = await run_in_threadpool(get_signed_url, report_path)
+    if not download_url:
+        raise HTTPException(status_code=500, detail="Could not generate download link.")
+    return {
+        "filename": filename,
+        "job_id": job.get("uuid"),
+        "report_download_url": download_url
+    }

app/utils.py CHANGED Viewed

@@ -112,4 +112,43 @@ def start_cleanup_thread(retention_seconds: int = 24 * 3600, interval_seconds: i
     import threading as _th
     t = _th.Thread(target=cleanup_expired_reports, args=(retention_seconds, interval_seconds), daemon=True)
     t.start()
-    _cleanup_thread_started = True

     import threading as _th
     t = _th.Thread(target=cleanup_expired_reports, args=(retention_seconds, interval_seconds), daemon=True)
     t.start()
+    _cleanup_thread_started = True
+def get_job_by_filename(filename: str) -> Optional[Dict]:
+    """Fetch a job entry by its original filename."""
+    try:
+        # Query Supabase for the filename
+        response = supabase.table("job_metadata")\
+            .select("*")\
+            .eq("original_filename", filename)\
+            .limit(1)\
+            .execute()
+        if response.data and len(response.data) > 0:
+            row = response.data[0]
+            return {
+                "uuid": row["job_id"],
+                "original_filename": row["original_filename"],
+                "report": row["report_path"],
+                "created_at": row["created_at"],
+                "expires_at": row["expires_at"]
+            }
+    except Exception as e:
+        print(f"Error checking duplicate: {e}")
+    return None
+def list_all_jobs(limit: int = 100) -> Dict[str, List[str]]:
+    """Return a separated list of PDF filenames and MD report paths."""
+    try:
+        data = read_metadata(limit)
+        pdf_files = [item["original_filename"] for item in data]
+        md_files = [item["report"] for item in data if item.get("report")]
+        return {
+            "pdf_files": pdf_files,
+            "md_files": md_files,
+            "full_data": data # Useful if frontend needs ID mapping
+        }
+    except Exception as e:
+        print(f"Error listing files: {e}")
+        return {"pdf_files": [], "md_files": []}