Spaces:

WizardCoder2007
/

social_media_analyzer

Running

App Files Files Community

social_media_analyzer / main.py

WizardCoder2007

update

96fd51a 10 days ago

raw

history blame contribute delete

11.6 kB

	import csv
	import hmac
	import io
	import json
	import logging
	import math
	import mimetypes
	import os
	import random
	import re
	import shutil
	import sys
	import threading
	import time
	import uuid
	from datetime import datetime, timedelta, timezone
	from pathlib import Path
	from typing import Literal, Optional, Tuple

	import requests
	from fastapi import FastAPI, Query, HTTPException, Header, BackgroundTasks, Request, Response
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, FileResponse
	from pydantic import BaseModel
	from starlette.concurrency import run_in_threadpool

	try:
	    import processor
	except Exception as e:
	    # Fail fast at import time: the /rerun endpoint calls
	    # processor.generate_reports_from_csv. Chain the original exception so
	    # its traceback is not lost behind the RuntimeError.
	    raise RuntimeError(f"Failed to import processor.py: {e}") from e

	from reddit_scrapper import scrape_reddit_to_csv

	# python-docx is optional: DOCX_AVAILABLE records whether the import worked.
	try:
	    from docx import Document
	    from docx.shared import Inches
	except Exception:
	    DOCX_AVAILABLE = False
	else:
	    DOCX_AVAILABLE = True

	class RerunRequest(BaseModel):
	    """Request body accepted by POST /rerun.

	    ``intent`` must be one of the three literal values below; the endpoint
	    uses it as the key into ``INTENT_LIMITS``.
	    """

	    intent: Literal["light", "medium", "deep"]

	# Scrape limits per intent; both numbers are passed to the scraper by /rerun.
	INTENT_LIMITS = dict(
	    light=dict(per_query=10, total=25),
	    medium=dict(per_query=50, total=300),
	    deep=dict(per_query=100, total=800),
	)

	# ---- Configuration ----
	# All generated files live in <directory of this file>/storage/latest.
	BASE_DIR = Path(__file__).resolve().parent
	STORAGE_DIR = BASE_DIR.joinpath("storage")
	LATEST_DIR = STORAGE_DIR.joinpath("latest")
	# Create the parent first; neither call passes parents=True.
	STORAGE_DIR.mkdir(exist_ok=True)
	LATEST_DIR.mkdir(exist_ok=True)

	# Optional API key: when the env var is set, POST /rerun requires it.
	API_KEY = os.environ.get("API_KEY")

	# Logging: INFO level on the root logger, one named logger for this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger("report-saver")

	# FastAPI code
	app = FastAPI(title="Auto Report API (CSV → PDF/DOCX)")

	# Browser origins allowed to call this API: the deployed frontend plus the
	# usual local development servers.
	origins = [
	    "https://ciis-indol.vercel.app",
	    "http://localhost:8080",
	    "http://127.0.0.1:8080",
	    "http://localhost:5173",
	    "http://127.0.0.1:5173",
	    "http://localhost:8000",
	    "http://127.0.0.1:8000",
	]

	# "*" is the wildcard for "any method" / "any header". An empty string matches
	# nothing, so a preflighted request (for example POST /rerun with a JSON body
	# and an x-api-key header) would be refused before reaching the endpoint.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=origins,
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# Helper: safe path join inside storage using Path
	def storage_path(filename: str) -> Path:
	    """Return the path of ``filename`` directly inside ``LATEST_DIR``.

	    Only the final path component is kept, so directory parts such as
	    ``"../"`` or an absolute prefix are discarded.

	    Raises:
	        ValueError: if nothing usable is left (empty name, ``"."`` or ``".."``),
	            because those would point at a directory rather than a file
	            inside ``LATEST_DIR``.
	    """
	    safe_name = os.path.basename(filename)
	    if safe_name in ("", ".", ".."):
	        raise ValueError(f"Invalid storage filename: {filename!r}")
	    return LATEST_DIR / safe_name

	def scrape_live_data(output_csv_path: str, per_query: int, total: int) -> None:
	    """Run the Reddit scraper, forwarding all three arguments unchanged.

	    Thin wrapper around ``scrape_reddit_to_csv``; whatever that call returns
	    is discarded.
	    """
	    scrape_reddit_to_csv(output_csv_path, per_query, total)

	# ------------------------------
	# Range-supporting file response for large files (PDF preview)
	def get_range_byte_positions(range_header: Optional[str], file_size: int) -> Optional[Tuple[int, int]]:
	    """Parse a single-range HTTP ``Range`` header into inclusive byte offsets.

	    Supported forms (``bytes`` unit only):
	        ``bytes=0-1023``   explicit first and last byte
	        ``bytes=1024-``    from byte 1024 to the end of the file
	        ``bytes=-500``     the last 500 bytes

	    Args:
	        range_header: Raw header value, or ``None``/empty when the header is absent.
	        file_size: Size of the target file in bytes.

	    Returns:
	        ``(start, end)`` clamped to ``[0, file_size - 1]``, or ``None`` when the
	        header is absent, uses another unit, is malformed (including
	        multi-range lists and stray dashes), or selects no bytes.
	    """
	    if not range_header:
	        return None
	    header = range_header.strip()
	    if not header.startswith("bytes="):
	        return None

	    # Exactly one "<digits>-<digits>" spec, either side optional. Anything else
	    # ("5", "5--3", "0-5-7", "0-10,20-30") fails the match and is rejected
	    # instead of being half-parsed.
	    match = re.fullmatch(r"\s*(\d*)\s*-\s*(\d*)\s*", header[len("bytes="):], re.ASCII)
	    if match is None:
	        return None
	    first, last = match.groups()
	    if not first and not last:
	        # A bare "-" names no bytes at all.
	        return None

	    last_byte = file_size - 1
	    if not first:
	        # suffix bytes: '-N' -> last N bytes
	        start, end = file_size - int(last), last_byte
	    elif not last:
	        # 'start-' to end of file
	        start, end = int(first), last_byte
	    else:
	        start, end = int(first), int(last)

	    # Clamp to the file, then reject ranges that end up empty.
	    start = max(start, 0)
	    end = min(end, last_byte)
	    if start > end:
	        return None
	    return (start, end)

	def range_stream_response(path: Path, request: Request) -> StreamingResponse:
	    """Return a StreamingResponse that honors Range requests for a file.

	    Args:
	        path: File to send.
	        request: Incoming request; only its ``Range`` header is read.

	    Returns:
	        A 206 response carrying ``Content-Range`` when the header parses to a
	        usable range, otherwise a 200 response with the whole file. Both are
	        streamed in 1 MiB chunks and never send more than ``Content-Length``.
	    """
	    chunk_size = 1024 * 1024
	    file_size = path.stat().st_size
	    range_pos = get_range_byte_positions(request.headers.get("range"), file_size)

	    # Report the real media type (e.g. application/pdf) so content sent with an
	    # inline disposition is labelled correctly; fall back to a generic binary
	    # type when the extension is unknown.
	    content_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
	    headers = {
	        "Accept-Ranges": "bytes",
	        "Content-Type": content_type,
	        "Content-Disposition": f'inline; filename="{path.name}"',
	    }

	    if range_pos is None:
	        # full content
	        start, length, status_code = 0, file_size, 200
	    else:
	        # status 206 Partial Content
	        start, end = range_pos
	        length = end - start + 1
	        status_code = 206
	        headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
	    headers["Content-Length"] = str(length)

	    def iter_file():
	        # One generator serves both cases: a full response is simply the range
	        # [0, file_size). Capping at `length` keeps the body consistent with
	        # Content-Length even if the file grows while it is being sent.
	        with open(path, "rb") as f:
	            f.seek(start)
	            remaining = length
	            while remaining > 0:
	                chunk = f.read(min(chunk_size, remaining))
	                if not chunk:
	                    break
	                remaining -= len(chunk)
	                yield chunk

	    return StreamingResponse(iter_file(), status_code=status_code, headers=headers)


	@app.get("/")
	def home():
	    """Root endpoint: reply with a fixed status message."""
	    payload = {"message": "sever working"}
	    return payload

	# Every rerun writes the same files under LATEST_DIR, so runs must not overlap
	# now that the pipeline executes in worker threads.
	_RERUN_LOCK = threading.Lock()


	def _write_latest_meta() -> dict:
	    """Record which report files exist in LATEST_DIR and persist it as meta.json."""
	    LATEST_DIR.mkdir(parents=True, exist_ok=True)

	    def url_if_present(name: str) -> str:
	        return f"/files/{name}" if (LATEST_DIR / name).exists() else ""

	    # Define IST timezone
	    ist = timezone(timedelta(hours=5, minutes=30))
	    meta = {
	        "pdf": url_if_present("report.pdf"),
	        "csv": url_if_present("analysis_output.csv"),
	        "docx": url_if_present("report.docx"),
	        "generated_at": datetime.now(ist).strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    # Write to a temporary file and rename it into place so a concurrent
	    # GET /report never reads a half-written meta.json.
	    tmp_path = LATEST_DIR / "meta.json.tmp"
	    with open(tmp_path, "w", encoding="utf-8") as mf:
	        json.dump(meta, mf)
	    os.replace(tmp_path, LATEST_DIR / "meta.json")
	    return meta


	def _run_rerun_pipeline(limits: dict) -> dict:
	    """Scrape, process and publish metadata; intended to run in a worker thread.

	    Returns the metadata dict written to meta.json. Any failure is logged and
	    re-raised as ``HTTPException(500)`` naming the step that failed.
	    """
	    work_dir = LATEST_DIR
	    with _RERUN_LOCK:
	        work_dir.mkdir(parents=True, exist_ok=True)

	        # step 1: scrape live data -> create input CSV path
	        input_csv = work_dir / "scraped_input.csv"
	        try:
	            logger.info("Starting scraping to %s...", input_csv)
	            scrape_live_data(str(input_csv), int(limits["per_query"]), int(limits["total"]))
	            logger.info("Scraping completed successfully.")
	        except Exception as e:
	            logger.exception("Scraping failed: %s", e)
	            raise HTTPException(status_code=500, detail=f"Scraping failed: {e}") from e

	        # step 2: process csv into pdf, docx, analysis_output.csv
	        try:
	            logger.info("Calling user-provided processor.generate_reports_from_csv")
	            out = processor.generate_reports_from_csv(str(input_csv), str(work_dir))
	            logger.info("Processing return value: %s", out)
	        except Exception as e:
	            logger.exception("Processing failed: %s", e)
	            raise HTTPException(status_code=500, detail=f"Processing failed: {e}") from e

	        # step 3: update 'latest' storage (atomically)
	        try:
	            return _write_latest_meta()
	        except Exception as e:
	            logger.exception("Failed to update latest storage: %s", e)
	            raise HTTPException(status_code=500, detail=f"Failed to update latest storage: {e}") from e


	@app.post("/rerun")
	async def rerun_endpoint(body: RerunRequest, x_api_key: Optional[str] = Header(None)):
	    """
	    Trigger live scraping + processing.
	    Optional x-api-key header if API_KEY is set in env.
	    This endpoint blocks until processing completes and returns file paths.

	    The blocking pipeline runs in a worker thread, so the request still waits
	    for completion but the event loop stays free to serve other endpoints.

	    Raises:
	        HTTPException: 401 for a missing/incorrect key, 500 when scraping,
	            processing or the metadata update fails.
	    """
	    # auth check
	    if API_KEY:
	        # Constant-time comparison; bytes are used because compare_digest
	        # rejects non-ASCII str arguments.
	        supplied = (x_api_key or "").encode("utf-8")
	        if not hmac.compare_digest(supplied, API_KEY.encode("utf-8")):
	            logger.warning("Rejected rerun: invalid API key")
	            raise HTTPException(status_code=401, detail="Invalid or missing x-api-key")

	    limits = INTENT_LIMITS[body.intent]
	    logger.info("Received rerun request. Intent: %s, Limits: %s", body.intent, limits)

	    meta = await run_in_threadpool(_run_rerun_pipeline, limits)

	    logger.info("Rerun completed, files available under latest/ directory")
	    return JSONResponse(status_code=200, content={
	        "status": "ok",
	        "pdf": meta["pdf"],
	        "csv": meta["csv"],
	        "docx": meta["docx"],
	    })


	@app.get("/report")
	async def get_report():
	    """
	    Return the contents of meta.json from the latest directory
	    (pdf/csv/docx links and generation time), or 404 when it does not exist.
	    """
	    meta_path = LATEST_DIR.joinpath("meta.json")
	    if meta_path.exists():
	        with meta_path.open("r", encoding="utf-8") as handle:
	            payload = json.load(handle)
	        return JSONResponse(status_code=200, content=payload)
	    raise HTTPException(status_code=404, detail="No report available yet")

	@app.get("/pdf/view/{filename}")
	async def view_pdf(filename: str):
	    """Serve a file from the latest directory inline as ``application/pdf``.

	    Raises:
	        HTTPException: 404 when no regular file of that name exists there.
	    """
	    # Keep only the final path component so the name cannot point outside
	    # LATEST_DIR (same sanitisation as /files/{filename}).
	    path = LATEST_DIR / os.path.basename(filename)
	    # is_file() also turns away directories such as "." or "..", which a plain
	    # exists() check would let through to FileResponse.
	    if not path.is_file():
	        raise HTTPException(404, "File not found")

	    return FileResponse(
	        path,
	        media_type="application/pdf",
	        headers={
	            "Content-Disposition": f'inline; filename="{path.name}"'
	        },
	    )

	@app.get("/pdf/download/{filename}")
	async def download_pdf(filename: str):
	    """Serve a file from the latest directory as an ``application/pdf`` attachment.

	    Raises:
	        HTTPException: 404 when no regular file of that name exists there.
	    """
	    # Keep only the final path component so the name cannot point outside
	    # LATEST_DIR (same sanitisation as /files/{filename}).
	    path = LATEST_DIR / os.path.basename(filename)
	    # is_file() also turns away directories such as "." or "..", which a plain
	    # exists() check would let through to FileResponse.
	    if not path.is_file():
	        raise HTTPException(404, "File not found")

	    return FileResponse(
	        path,
	        media_type="application/pdf",
	        headers={
	            "Content-Disposition": f'attachment; filename="{path.name}"'
	        },
	    )


	@app.get("/files/{filename}")
	async def serve_file(filename: str, request: Request):
	    """
	    Stream a file out of the latest directory.
	    PDF requests that carry a Range header are delegated to
	    range_stream_response; everything else is sent whole.
	    """
	    path = LATEST_DIR / os.path.basename(filename)
	    if not (path.exists() and path.is_file()):
	        raise HTTPException(status_code=404, detail="File not found")

	    # Map the (lower-cased) extension to a media type, defaulting to binary.
	    suffix = path.suffix.lower()
	    known_types = {
	        ".pdf": "application/pdf",
	        ".csv": "text/csv",
	        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	    }
	    media_type = known_types.get(suffix, "application/octet-stream")

	    # Range handling is only offered for PDFs.
	    wants_range = bool(request.headers.get("range"))
	    if wants_range and suffix == ".pdf":
	        return range_stream_response(path, request)

	    def file_iterator():
	        # Whole-file streaming in 1 MiB pieces until read() returns b"".
	        with open(path, "rb") as f:
	            for chunk in iter(lambda: f.read(1024 * 1024), b""):
	                yield chunk

	    headers = {
	        "Content-Disposition": f'inline; filename="{path.name}"',
	        "Content-Length": str(path.stat().st_size),
	    }
	    return StreamingResponse(file_iterator(), media_type=media_type, headers=headers)

	if __name__ == "__main__":
	    # Script entry point: serve the app with uvicorn on $PORT (default 8000).
	    import uvicorn

	    port = int(os.getenv("PORT", 8000))
	    logger.info("Starting on port %s", port)
	    uvicorn.run("main:app", host="0.0.0.0", port=port, log_level="info")