|
|
import csv
import hmac
import io
import json
import logging
import math
import mimetypes
import os
import random
import re
import shutil
import sys
import time
import uuid
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Literal, Optional, Tuple

import requests
from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Query, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
from pydantic import BaseModel
from starlette.concurrency import run_in_threadpool
|
|
|
|
|
# /rerun calls processor.generate_reports_from_csv, so fail fast at import
# time with a clear message if the module cannot be loaded.
try:
    import processor
except Exception as e:
    # Chain explicitly so the original failure is shown as the direct cause.
    raise RuntimeError(f"Failed to import processor.py: {e}") from e
|
|
|
|
|
from reddit_scrapper import scrape_reddit_to_csv |
|
|
|
|
|
|
|
|
# Record whether the optional `docx` package imported successfully.
# Any import-time failure (not only a missing package) marks it unavailable.
try:
    from docx import Document
    from docx.shared import Inches
except Exception:
    DOCX_AVAILABLE = False
else:
    DOCX_AVAILABLE = True
|
|
|
|
|
class RerunRequest(BaseModel):
    """JSON body accepted by the POST /rerun endpoint."""

    # Requested scrape depth; each allowed value is a key of INTENT_LIMITS.
    intent: Literal["light", "medium", "deep"]
|
|
|
|
|
# Scrape budgets keyed by intent. /rerun forwards "per_query" and "total" to
# scrape_live_data (presumably a per-query cap and an overall cap on results).
INTENT_LIMITS = {
    level: {"per_query": per_query, "total": total}
    for level, per_query, total in (
        ("light", 10, 25),
        ("medium", 50, 300),
        ("deep", 100, 800),
    )
}
|
|
|
|
|
|
|
|
# Report artifacts live in <directory of this file>/storage/latest.
BASE_DIR = Path(__file__).resolve().parent
STORAGE_DIR = BASE_DIR.joinpath("storage")
LATEST_DIR = STORAGE_DIR.joinpath("latest")

# Create both directories at import time. The parent goes first because
# mkdir is called without parents=True.
STORAGE_DIR.mkdir(exist_ok=True)
LATEST_DIR.mkdir(exist_ok=True)
|
|
|
|
|
|
|
|
# Shared secret checked by /rerun; when unset or empty the check is skipped.
API_KEY = os.getenv("API_KEY")

# Root logging at INFO, plus the named logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("report-saver")
|
|
|
|
|
|
|
|
# Application object; the route handlers in this module register on it.
app = FastAPI(title="Auto Report API (CSV → PDF/DOCX)")

# Browser origins allowed by CORS: one hosted origin plus localhost/127.0.0.1
# on ports 8080, 5173 and 8000.
origins = [
    "https://ciis-indol.vercel.app",
    "http://localhost:8080",
    "http://127.0.0.1:8080",
    "http://localhost:5173",
    "http://127.0.0.1:5173",
    "http://localhost:8000",
    "http://127.0.0.1:8000",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
|
|
def storage_path(filename: str) -> Path:
    """Return the path of *filename* inside LATEST_DIR (no existence check)."""
    return LATEST_DIR.joinpath(filename)
|
|
|
|
|
def scrape_live_data(output_csv_path: str, per_query: int, total: int) -> None:
    """Forward the three arguments, positionally, to scrape_reddit_to_csv.

    The scraper's return value is discarded. It presumably writes its results
    to *output_csv_path*; confirm against reddit_scrapper.
    """
    scraper_args = (output_csv_path, per_query, total)
    scrape_reddit_to_csv(*scraper_args)
|
|
|
|
|
|
|
|
|
|
|
def get_range_byte_positions(range_header: str, file_size: int) -> Optional[Tuple[int, int]]:
    """Parse a single-range HTTP ``Range`` header into inclusive byte offsets.

    Supported forms are ``bytes=START-END``, ``bytes=START-`` (to end of file)
    and ``bytes=-N`` (last N bytes). Offsets are clamped to the file.

    Args:
        range_header: Raw header value; may be empty or ``None``.
        file_size: Size of the target file in bytes.

    Returns:
        ``(start, end)`` with both ends inclusive, or ``None`` when the header
        is missing, is not a ``bytes=`` range, is malformed (including
        multi-range or extra-dash specs), or selects no bytes after clamping.
    """
    if not range_header:
        return None
    header = range_header.strip()
    if not header.startswith("bytes="):
        return None

    # Split on the first dash only: anything left over ("5-10-20", "0--5",
    # "0-1,5-9") then fails int() or the start > end check instead of being
    # silently accepted as a valid range.
    first, dash, last = header[len("bytes="):].partition("-")
    if not dash:
        return None

    try:
        if first == "":
            # Suffix form: the final N bytes of the file.
            start = file_size - int(last)
            end = file_size - 1
        elif last == "":
            # Open-ended form: from START to the end of the file.
            start = int(first)
            end = file_size - 1
        else:
            start = int(first)
            end = int(last)
    except ValueError:
        return None

    # Clamp to the file, then reject empty or inverted ranges.
    start = max(start, 0)
    end = min(end, file_size - 1)
    if start > end:
        return None
    return (start, end)
|
|
|
|
|
def range_stream_response(path: Path, request: Request) -> StreamingResponse:
    """Return a StreamingResponse that honors Range requests for a file.

    Args:
        path: File to stream.
        request: Incoming request; only its ``range`` header is read.

    Returns:
        A 206 response carrying ``Content-Range`` when the header parses to a
        satisfiable single range, otherwise a 200 response with the whole file.
        Either way the body is streamed in chunks of at most 1 MiB.
    """
    file_size = path.stat().st_size
    range_pos = get_range_byte_positions(request.headers.get("range"), file_size)

    # Derive the media type from the file name so a ranged response for a PDF
    # is typed the same as a full one; fall back to a generic binary type.
    media_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
    headers = {
        "Accept-Ranges": "bytes",
        "Content-Type": media_type,
        "Content-Disposition": f'inline; filename="{path.name}"',
    }

    if range_pos is None:
        start, length, status_code = 0, file_size, 200
    else:
        start, end = range_pos
        length = end - start + 1
        status_code = 206
        headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
    headers["Content-Length"] = str(length)

    def iter_chunks(chunk_size: int = 1024 * 1024):
        # Send exactly `length` bytes so the body never exceeds the declared
        # Content-Length, even if the file grows while it is being streamed.
        with open(path, "rb") as f:
            f.seek(start)
            remaining = length
            while remaining > 0:
                chunk = f.read(min(chunk_size, remaining))
                if not chunk:
                    break
                remaining -= len(chunk)
                yield chunk

    return StreamingResponse(iter_chunks(), status_code=status_code, headers=headers)
|
|
|
|
|
|
|
|
@app.get("/")
def home():
    """Return a fixed JSON message showing that the server responds."""
    status = {"message": "sever working"}
    return status
|
|
|
|
|
def _api_key_matches(candidate: Optional[str]) -> bool:
    """Compare *candidate* with API_KEY in constant time (API_KEY must be set)."""
    if not candidate:
        return False
    # Encode to bytes: compare_digest rejects non-ASCII str arguments.
    return hmac.compare_digest(candidate.encode("utf-8"), API_KEY.encode("utf-8"))


def _write_latest_meta() -> dict:
    """Write LATEST_DIR/meta.json listing the report files that exist; return it."""
    LATEST_DIR.mkdir(parents=True, exist_ok=True)

    # The timestamp is rendered at a fixed UTC+05:30 offset.
    ist = timezone(timedelta(hours=5, minutes=30))
    generated_at = datetime.now(ist).strftime("%Y-%m-%d %H:%M:%S")

    # A URL is advertised only for files actually present; otherwise "".
    meta = {
        "pdf": "/files/report.pdf" if (LATEST_DIR / "report.pdf").exists() else "",
        "csv": "/files/analysis_output.csv" if (LATEST_DIR / "analysis_output.csv").exists() else "",
        "docx": "/files/report.docx" if (LATEST_DIR / "report.docx").exists() else "",
        "generated_at": generated_at,
    }
    with open(LATEST_DIR / "meta.json", "w", encoding="utf-8") as mf:
        json.dump(meta, mf)
    return meta


@app.post("/rerun")
async def rerun_endpoint(body: RerunRequest, x_api_key: Optional[str] = Header(None)):
    """
    Trigger live scraping + processing.

    Optional x-api-key header if API_KEY is set in env.
    The response is sent only after processing completes. The blocking scrape
    and report generation run in a worker thread so the event loop stays free
    to serve other requests in the meantime.

    Returns:
        JSON ``{"status": "ok", "pdf": ..., "csv": ..., "docx": ...}`` where
        each entry is a ``/files/...`` URL, or ``""`` if that file is absent.

    Raises:
        HTTPException: 401 when API_KEY is set and the header is missing or
            wrong; 500 when scraping, processing, or writing meta.json fails.
    """
    if API_KEY and not _api_key_matches(x_api_key):
        logger.warning("Rejected rerun: invalid API key")
        raise HTTPException(status_code=401, detail="Invalid or missing x-api-key")

    work_dir = STORAGE_DIR / "latest"
    work_dir.mkdir(parents=True, exist_ok=True)

    input_csv = work_dir / "scraped_input.csv"
    limits = INTENT_LIMITS[body.intent]
    logger.info("Received rerun request. Intent: %s, Limits: %s", body.intent, limits)

    try:
        logger.info("Starting scraping to %s...", input_csv)
        await run_in_threadpool(
            scrape_live_data,
            str(input_csv),
            int(limits["per_query"]),
            int(limits["total"]),
        )
        logger.info("Scraping completed successfully.")
    except Exception as e:
        logger.exception("Scraping failed: %s", e)
        raise HTTPException(status_code=500, detail=f"Scraping failed: {e}") from e

    try:
        logger.info("Calling user-provided processor.generate_reports_from_csv")
        out = await run_in_threadpool(
            processor.generate_reports_from_csv, str(input_csv), str(work_dir)
        )
        # The return value is only logged; the response is built from the
        # files found in LATEST_DIR.
        logger.info("Processing return value: %s", out)
    except Exception as e:
        logger.exception("Processing failed: %s", e)
        raise HTTPException(status_code=500, detail=f"Processing failed: {e}") from e

    try:
        meta = _write_latest_meta()
    except Exception as e:
        logger.exception("Failed to update latest storage: %s", e)
        raise HTTPException(status_code=500, detail=f"Failed to update latest storage: {e}") from e

    logger.info("Rerun completed, files available under latest/ directory")
    return JSONResponse(status_code=200, content={
        "status": "ok",
        "pdf": meta["pdf"],
        "csv": meta["csv"],
        "docx": meta["docx"],
    })
|
|
|
|
|
|
|
|
@app.get("/report")
async def get_report():
    """
    Return metadata about current report (pdf/csv/docx)

    The body is the parsed content of LATEST_DIR/meta.json.

    Raises:
        HTTPException: 404 when meta.json does not exist.
    """
    meta_file = LATEST_DIR / "meta.json"
    # Open directly instead of checking exists() first, so a file that
    # disappears between the check and the read still yields a 404.
    try:
        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="No report available yet") from None
    return JSONResponse(status_code=200, content=meta)
|
|
|
|
|
@app.get("/pdf/view/{filename}")
async def view_pdf(filename: str):
    """Serve a file from LATEST_DIR as an inline ``application/pdf`` response.

    Raises:
        HTTPException: 404 when no regular file with that name exists.
    """
    # Keep only the final path component, as /files/{filename} does, so the
    # name cannot point outside LATEST_DIR.
    safe_name = os.path.basename(filename)
    path = LATEST_DIR / safe_name
    # is_file() also rejects directories (e.g. ".."), which exists() accepted.
    if not path.is_file():
        raise HTTPException(status_code=404, detail="File not found")

    return FileResponse(
        path,
        media_type="application/pdf",
        headers={"Content-Disposition": f'inline; filename="{path.name}"'},
    )
|
|
|
|
|
@app.get("/pdf/download/{filename}")
async def download_pdf(filename: str):
    """Serve a file from LATEST_DIR as an ``application/pdf`` attachment.

    Raises:
        HTTPException: 404 when no regular file with that name exists.
    """
    # Keep only the final path component, as /files/{filename} does, so the
    # name cannot point outside LATEST_DIR.
    safe_name = os.path.basename(filename)
    path = LATEST_DIR / safe_name
    # is_file() also rejects directories (e.g. ".."), which exists() accepted.
    if not path.is_file():
        raise HTTPException(status_code=404, detail="File not found")

    return FileResponse(
        path,
        media_type="application/pdf",
        headers={"Content-Disposition": f'attachment; filename="{path.name}"'},
    )
|
|
|
|
|
|
|
|
@app.get("/files/{filename}")
async def serve_file(filename: str, request: Request):
    """
    Serve files from the latest directory. Supports Range requests (for PDFs).

    Only the final component of *filename* is used. A request with a Range
    header for a .pdf is delegated to range_stream_response; everything else
    is streamed whole in 1 MiB chunks.

    Raises:
        HTTPException: 404 when the name does not resolve to a regular file.
    """
    path = LATEST_DIR / os.path.basename(filename)
    if not (path.exists() and path.is_file()):
        raise HTTPException(status_code=404, detail="File not found")

    suffix = path.suffix.lower()
    known_types = {
        ".pdf": "application/pdf",
        ".csv": "text/csv",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    media_type = known_types.get(suffix, "application/octet-stream")

    # Partial content is only offered for PDFs.
    if request.headers.get("range") and suffix == ".pdf":
        return range_stream_response(path, request)

    def file_iterator():
        with open(path, "rb") as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(1024 * 1024), b""):
                yield chunk

    headers = {
        "Content-Disposition": f'inline; filename="{path.name}"',
        "Content-Length": str(path.stat().st_size),
    }
    return StreamingResponse(file_iterator(), media_type=media_type, headers=headers)
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on $PORT (default 8000),
    # listening on all interfaces.
    import uvicorn

    port = int(os.getenv("PORT", 8000))
    logger.info("Starting on port %s", port)
    uvicorn.run("main:app", host="0.0.0.0", port=port, log_level="info")