Commit ·
1e36e31
1
Parent(s): 8892a43
commit
Browse files- .gitignore +18 -0
- Dockerfile +25 -0
- main.py +326 -0
- processor.py +503 -0
- reddit_scrapper.py +192 -0
- requirements.txt +133 -0
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
.venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
.env
|
| 6 |
+
.env.local
|
| 7 |
+
.env.example
|
| 8 |
+
.env.debug
|
| 9 |
+
.env.production
|
| 10 |
+
.env.test
|
| 11 |
+
.env.development
|
| 12 |
+
|
| 13 |
+
storage/latest/
|
| 14 |
+
*.png
|
| 15 |
+
*.pdf
|
| 16 |
+
*.csv
|
| 17 |
+
*.json
|
| 18 |
+
*.docx
|
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
# System deps (for matplotlib, reportlab, wordcloud)
|
| 9 |
+
RUN apt-get update && apt-get install -y \
|
| 10 |
+
build-essential \
|
| 11 |
+
gcc \
|
| 12 |
+
libglib2.0-0 \
|
| 13 |
+
libsm6 \
|
| 14 |
+
libxext6 \
|
| 15 |
+
libxrender1 \
|
| 16 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
+
|
| 18 |
+
COPY requirements.txt .
|
| 19 |
+
RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
|
| 20 |
+
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
EXPOSE 7860
|
| 24 |
+
|
| 25 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests,time,csv,re,json,sys,math,random,io
|
| 2 |
+
import uuid,shutil,logging,os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional,Tuple
|
| 5 |
+
from datetime import datetime, timezone,timedelta
|
| 6 |
+
|
| 7 |
+
from fastapi import FastAPI, Query, HTTPException, Header, BackgroundTasks, Request, Response
|
| 8 |
+
from fastapi.responses import HTMLResponse, JSONResponse,StreamingResponse,FileResponse
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from starlette.concurrency import run_in_threadpool
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
from typing import Literal
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
import processor
|
| 16 |
+
except Exception as e:
|
| 17 |
+
raise RuntimeError(f"Failed to import processor.py: {e}")
|
| 18 |
+
|
| 19 |
+
from reddit_scrapper import scrape_reddit_to_csv
|
| 20 |
+
|
| 21 |
+
# try import python-docx (optional)
|
| 22 |
+
DOCX_AVAILABLE = True
|
| 23 |
+
try:
|
| 24 |
+
from docx import Document
|
| 25 |
+
from docx.shared import Inches
|
| 26 |
+
except Exception:
|
| 27 |
+
DOCX_AVAILABLE = False
|
| 28 |
+
|
| 29 |
+
class RerunRequest(BaseModel):
|
| 30 |
+
intent: Literal["light", "medium", "deep"]
|
| 31 |
+
|
| 32 |
+
INTENT_LIMITS = {
|
| 33 |
+
"light": {"per_query": 20, "total": 40},
|
| 34 |
+
"medium": {"per_query": 50, "total": 300},
|
| 35 |
+
"deep": {"per_query": 100, "total": 800},
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
# ---- Configuration ----
|
| 39 |
+
BASE_DIR= Path(__file__).resolve().parent
|
| 40 |
+
STORAGE_DIR= BASE_DIR/"storage"
|
| 41 |
+
LATEST_DIR= STORAGE_DIR/"latest"
|
| 42 |
+
STORAGE_DIR.mkdir(exist_ok=True)
|
| 43 |
+
LATEST_DIR.mkdir(exist_ok=True)
|
| 44 |
+
|
| 45 |
+
# API key (optional) if set in env required for post/rerun
|
| 46 |
+
API_KEY= os.environ.get("API_KEY",None)
|
| 47 |
+
|
| 48 |
+
# logging
|
| 49 |
+
logging.basicConfig(level=logging.INFO)
|
| 50 |
+
logger= logging.getLogger("report-saver")
|
| 51 |
+
|
| 52 |
+
# FastAPI code
|
| 53 |
+
app= FastAPI(title="Auto Report API (CSV → PDF/DOCX)")
|
| 54 |
+
|
| 55 |
+
# CORS allow all in dev, restrict in production
|
| 56 |
+
origins=[
|
| 57 |
+
"https://ciis-indol.vercel.app",
|
| 58 |
+
"http://localhost:8080",
|
| 59 |
+
"http://127.0.0.1:8080",
|
| 60 |
+
"http://localhost:5173",
|
| 61 |
+
"http://127.0.0.1:5173",
|
| 62 |
+
"http://localhost:8000",
|
| 63 |
+
"http://127.0.0.1:8000"
|
| 64 |
+
]
|
| 65 |
+
|
| 66 |
+
app.add_middleware(CORSMiddleware, allow_origins=origins,allow_credentials=True, allow_methods=["*"],allow_headers=["*"])
|
| 67 |
+
|
| 68 |
+
# Helper: safe path join inside storage using Path
|
| 69 |
+
def storage_path(filename:str)-> Path:
|
| 70 |
+
return LATEST_DIR/filename
|
| 71 |
+
|
| 72 |
+
def scrape_live_data(output_csv_path:str, per_query: int, total:int)->None:
|
| 73 |
+
scrape_reddit_to_csv(output_csv_path,per_query,total)
|
| 74 |
+
|
| 75 |
+
# ------------------------------
|
| 76 |
+
# Range-supporting file response for large files (PDF preview)
|
| 77 |
+
def get_range_byte_positions(range_header: str, file_size: int) -> Optional[Tuple[int, int]]:
|
| 78 |
+
# Example Range header: 'bytes=0-1023' or 'bytes=1024-'
|
| 79 |
+
if not range_header:
|
| 80 |
+
return None
|
| 81 |
+
header = range_header.strip()
|
| 82 |
+
if not header.startswith("bytes="):
|
| 83 |
+
return None
|
| 84 |
+
range_val = header.split("=", 1)[1]
|
| 85 |
+
parts = range_val.split("-")
|
| 86 |
+
try:
|
| 87 |
+
if parts[0] == "":
|
| 88 |
+
# suffix bytes: '-N' -> last N bytes
|
| 89 |
+
end = file_size - 1
|
| 90 |
+
start = file_size - int(parts[1])
|
| 91 |
+
elif parts[1] == "":
|
| 92 |
+
# 'start-' to end of file
|
| 93 |
+
start = int(parts[0])
|
| 94 |
+
end = file_size - 1
|
| 95 |
+
else:
|
| 96 |
+
start = int(parts[0])
|
| 97 |
+
end = int(parts[1])
|
| 98 |
+
if start < 0:
|
| 99 |
+
start = 0
|
| 100 |
+
if end >= file_size:
|
| 101 |
+
end = file_size - 1
|
| 102 |
+
if start > end:
|
| 103 |
+
return None
|
| 104 |
+
return (start, end)
|
| 105 |
+
except Exception:
|
| 106 |
+
return None
|
| 107 |
+
|
| 108 |
+
def range_stream_response(path: Path, request: Request) -> StreamingResponse:
|
| 109 |
+
"""Return a StreamingResponse that honors Range requests for a file."""
|
| 110 |
+
file_size = path.stat().st_size
|
| 111 |
+
range_header = request.headers.get("range")
|
| 112 |
+
range_pos = get_range_byte_positions(range_header, file_size)
|
| 113 |
+
headers = {
|
| 114 |
+
"Accept-Ranges": "bytes",
|
| 115 |
+
"Content-Type": "application/octet-stream",
|
| 116 |
+
"Content-Disposition": f'inline; filename="{path.name}"',
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
if range_pos is None:
|
| 120 |
+
# full content
|
| 121 |
+
def iterfile():
|
| 122 |
+
with open(path, "rb") as f:
|
| 123 |
+
while True:
|
| 124 |
+
chunk = f.read(1024 * 1024)
|
| 125 |
+
if not chunk:
|
| 126 |
+
break
|
| 127 |
+
yield chunk
|
| 128 |
+
headers["Content-Length"] = str(file_size)
|
| 129 |
+
return StreamingResponse(iterfile(), status_code=200, headers=headers)
|
| 130 |
+
else:
|
| 131 |
+
start, end = range_pos
|
| 132 |
+
length = end - start + 1
|
| 133 |
+
headers["Content-Length"] = str(length)
|
| 134 |
+
headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
|
| 135 |
+
# status 206 Partial Content
|
| 136 |
+
def iterfile_range():
|
| 137 |
+
with open(path, "rb") as f:
|
| 138 |
+
f.seek(start)
|
| 139 |
+
remaining = length
|
| 140 |
+
chunk_size = 1024 * 1024
|
| 141 |
+
while remaining > 0:
|
| 142 |
+
to_read = min(chunk_size, remaining)
|
| 143 |
+
chunk = f.read(to_read)
|
| 144 |
+
if not chunk:
|
| 145 |
+
break
|
| 146 |
+
remaining -= len(chunk)
|
| 147 |
+
yield chunk
|
| 148 |
+
return StreamingResponse(iterfile_range(), status_code=206, headers=headers)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@app.get("/")
|
| 152 |
+
def home():
|
| 153 |
+
return {"message":"sever working"}
|
| 154 |
+
|
| 155 |
+
@app.post("/rerun")
|
| 156 |
+
async def rerun_endpoint(body: RerunRequest, x_api_key: Optional[str] = Header(None)):
|
| 157 |
+
"""
|
| 158 |
+
Trigger live scraping + processing.
|
| 159 |
+
Optional x-api-key header if API_KEY is set in env.
|
| 160 |
+
This endpoint blocks until processing completes and returns file paths.
|
| 161 |
+
"""
|
| 162 |
+
# auth check
|
| 163 |
+
if API_KEY:
|
| 164 |
+
if not x_api_key or x_api_key != API_KEY:
|
| 165 |
+
logger.warning("Rejected rerun: invalid API key")
|
| 166 |
+
raise HTTPException(status_code=401, detail="Invalid or missing x-api-key")
|
| 167 |
+
|
| 168 |
+
# create a new working folder
|
| 169 |
+
# uid = uuid.uuid4().hex
|
| 170 |
+
work_dir = STORAGE_DIR / "latest"
|
| 171 |
+
work_dir.mkdir(parents=True, exist_ok=True)
|
| 172 |
+
|
| 173 |
+
# step 1: scrape live data -> create input CSV path
|
| 174 |
+
input_csv = work_dir / "scraped_input.csv"
|
| 175 |
+
limits= INTENT_LIMITS[body.intent]
|
| 176 |
+
logger.info(f"Received rerun request. Intent: {body.intent}, Limits: {limits}")
|
| 177 |
+
|
| 178 |
+
try:
|
| 179 |
+
logger.info(f"Starting scraping to {input_csv}...")
|
| 180 |
+
scrape_live_data(str(input_csv),int(limits["per_query"]),int(limits["total"]))
|
| 181 |
+
logger.info("Scraping completed successfully.")
|
| 182 |
+
except Exception as e:
|
| 183 |
+
logger.exception("Scraping failed: %s", e)
|
| 184 |
+
raise HTTPException(status_code=500, detail=f"Scraping failed: {e}")
|
| 185 |
+
|
| 186 |
+
# step 2: process csv into pdf, docx, analysis_output.csv
|
| 187 |
+
try:
|
| 188 |
+
logger.info("Calling user-provided processor.generate_reports_from_csv")
|
| 189 |
+
# assume processor writes to out_dir and returns dict or nothing
|
| 190 |
+
out = processor.generate_reports_from_csv(str(input_csv), str(work_dir))
|
| 191 |
+
logger.info(f"Processing return value: {out}")
|
| 192 |
+
|
| 193 |
+
# normalize result
|
| 194 |
+
pdf_path = str(work_dir / "report.pdf")
|
| 195 |
+
csv_path = str(work_dir / "analysis_output.csv")
|
| 196 |
+
docx_path = str(work_dir / "report.docx")
|
| 197 |
+
# if processor returned explicit paths, use them
|
| 198 |
+
if isinstance(out, dict):
|
| 199 |
+
pdf_path = out.get("pdf", pdf_path)
|
| 200 |
+
csv_path = out.get("csv", csv_path)
|
| 201 |
+
docx_path = out.get("docx", docx_path)
|
| 202 |
+
result = {"pdf": pdf_path, "csv": csv_path, "docx": docx_path}
|
| 203 |
+
except Exception as e:
|
| 204 |
+
logger.exception("Processing failed: %s", e)
|
| 205 |
+
raise HTTPException(status_code=500, detail=f"Processing failed: {e}")
|
| 206 |
+
|
| 207 |
+
# step 3: update 'latest' storage (atomically)
|
| 208 |
+
try:
|
| 209 |
+
# clear latest directory
|
| 210 |
+
# if LATEST_DIR.exists():
|
| 211 |
+
# shutil.rmtree(LATEST_DIR)
|
| 212 |
+
LATEST_DIR.mkdir(parents=True, exist_ok=True)
|
| 213 |
+
# Define IST timezone
|
| 214 |
+
IST = timezone(timedelta(hours=5, minutes=30))
|
| 215 |
+
generated_at = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S")
|
| 216 |
+
# write metadata file
|
| 217 |
+
meta = {
|
| 218 |
+
"pdf": "/files/report.pdf" if (LATEST_DIR / "report.pdf").exists() else "",
|
| 219 |
+
"csv": "/files/analysis_output.csv" if (LATEST_DIR / "analysis_output.csv").exists() else "",
|
| 220 |
+
"docx": "/files/report.docx" if (LATEST_DIR / "report.docx").exists() else "",
|
| 221 |
+
"generated_at": generated_at,
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
# write meta to disk for persistence
|
| 225 |
+
with open(LATEST_DIR / "meta.json", "w", encoding="utf-8") as mf:
|
| 226 |
+
import json
|
| 227 |
+
json.dump(meta, mf)
|
| 228 |
+
|
| 229 |
+
except Exception as e:
|
| 230 |
+
logger.exception("Failed to update latest storage: %s", e)
|
| 231 |
+
raise HTTPException(status_code=500, detail=f"Failed to update latest storage: {e}")
|
| 232 |
+
|
| 233 |
+
logger.info("Rerun completed, files available under latest/ directory")
|
| 234 |
+
return JSONResponse(status_code=200, content={
|
| 235 |
+
"status": "ok",
|
| 236 |
+
"pdf": meta["pdf"],
|
| 237 |
+
"csv": meta["csv"],
|
| 238 |
+
"docx": meta["docx"]
|
| 239 |
+
})
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
@app.get("/report")
|
| 243 |
+
async def get_report():
|
| 244 |
+
"""
|
| 245 |
+
Return metadata about current report (pdf/csv/docx)
|
| 246 |
+
"""
|
| 247 |
+
meta_file = LATEST_DIR / "meta.json"
|
| 248 |
+
if not meta_file.exists():
|
| 249 |
+
raise HTTPException(status_code=404, detail="No report available yet")
|
| 250 |
+
import json
|
| 251 |
+
with open(meta_file, "r", encoding="utf-8") as f:
|
| 252 |
+
meta = json.load(f)
|
| 253 |
+
return JSONResponse(status_code=200, content=meta)
|
| 254 |
+
|
| 255 |
+
@app.get("/pdf/view/{filename}")
|
| 256 |
+
async def view_pdf(filename: str):
|
| 257 |
+
path = LATEST_DIR / filename
|
| 258 |
+
if not path.exists():
|
| 259 |
+
raise HTTPException(404, "File not found")
|
| 260 |
+
|
| 261 |
+
return FileResponse(
|
| 262 |
+
path,
|
| 263 |
+
media_type="application/pdf",
|
| 264 |
+
headers={
|
| 265 |
+
"Content-Disposition": f'inline; filename="{path.name}"'
|
| 266 |
+
}
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
@app.get("/pdf/download/{filename}")
|
| 270 |
+
async def download_pdf(filename: str):
|
| 271 |
+
path = LATEST_DIR / filename
|
| 272 |
+
if not path.exists():
|
| 273 |
+
raise HTTPException(404, "File not found")
|
| 274 |
+
|
| 275 |
+
return FileResponse(
|
| 276 |
+
path,
|
| 277 |
+
media_type="application/pdf",
|
| 278 |
+
headers={
|
| 279 |
+
"Content-Disposition": f'attachment; filename="{path.name}"'
|
| 280 |
+
}
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
@app.get("/files/{filename}")
|
| 285 |
+
async def serve_file(filename: str, request: Request):
|
| 286 |
+
"""
|
| 287 |
+
Serve files from the latest directory. Supports Range requests (for PDFs).
|
| 288 |
+
"""
|
| 289 |
+
safe_name = os.path.basename(filename)
|
| 290 |
+
path = LATEST_DIR / safe_name
|
| 291 |
+
if not path.exists() or not path.is_file():
|
| 292 |
+
raise HTTPException(status_code=404, detail="File not found")
|
| 293 |
+
# Detect file type
|
| 294 |
+
if path.suffix.lower() == ".pdf":
|
| 295 |
+
media_type = "application/pdf"
|
| 296 |
+
elif path.suffix.lower() == ".csv":
|
| 297 |
+
media_type = "text/csv"
|
| 298 |
+
elif path.suffix.lower() == ".docx":
|
| 299 |
+
media_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
| 300 |
+
else:
|
| 301 |
+
media_type = "application/octet-stream"
|
| 302 |
+
|
| 303 |
+
# if the client supports Range (commonly for PDFs), use range_stream_response
|
| 304 |
+
range_header = request.headers.get("range")
|
| 305 |
+
if range_header and path.suffix.lower() == ".pdf":
|
| 306 |
+
return range_stream_response(path, request)
|
| 307 |
+
else:
|
| 308 |
+
# full file streaming
|
| 309 |
+
def file_iterator():
|
| 310 |
+
with open(path, "rb") as f:
|
| 311 |
+
while True:
|
| 312 |
+
chunk = f.read(1024 * 1024)
|
| 313 |
+
if not chunk:
|
| 314 |
+
break
|
| 315 |
+
yield chunk
|
| 316 |
+
headers = {
|
| 317 |
+
"Content-Disposition": f'inline; filename="{path.name}"',
|
| 318 |
+
"Content-Length": str(path.stat().st_size),
|
| 319 |
+
}
|
| 320 |
+
return StreamingResponse(file_iterator(), media_type=media_type, headers=headers)
|
| 321 |
+
|
| 322 |
+
if __name__=='__main__':
|
| 323 |
+
import uvicorn
|
| 324 |
+
port= int(os.environ.get("PORT",8000))
|
| 325 |
+
logger.info("Starting on port %s",port)
|
| 326 |
+
uvicorn.run("main:app",host="0.0.0.0",port=port,log_level="info")
|
processor.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Updated: supports large tables using LongTable + docx export.
|
| 3 |
+
Processor module.
|
| 4 |
+
Expose: generate_reports_from_csv(input_csv: str, out_dir: str) -> dict
|
| 5 |
+
Produces: out_dir/analysis_output.csv, out_dir/report.pdf, out_dir/report.docx (optional)
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os,re,sys,csv,logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
from wordcloud import WordCloud, STOPWORDS
|
| 15 |
+
from transformers import pipeline
|
| 16 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 17 |
+
from sklearn.decomposition import LatentDirichletAllocation
|
| 18 |
+
|
| 19 |
+
# reportlab platypus
|
| 20 |
+
from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer, PageBreak,
|
| 21 |
+
TableStyle, Image, LongTable)
|
| 22 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 23 |
+
from reportlab.lib import colors
|
| 24 |
+
from reportlab.lib.pagesizes import A4
|
| 25 |
+
from reportlab.lib.units import inch
|
| 26 |
+
from reportlab.lib.enums import TA_LEFT
|
| 27 |
+
|
| 28 |
+
# try import python-docx (optional)
|
| 29 |
+
DOCX_AVAILABLE = True
|
| 30 |
+
try:
|
| 31 |
+
from docx import Document
|
| 32 |
+
from docx.shared import Inches
|
| 33 |
+
except Exception:
|
| 34 |
+
DOCX_AVAILABLE = False
|
| 35 |
+
|
| 36 |
+
logger = logging.getLogger("processor")
|
| 37 |
+
logger.setLevel(logging.INFO)
|
| 38 |
+
|
| 39 |
+
# ---------------- CONFIG ----------------
|
| 40 |
+
CSV_ENCODING = "utf-8"
|
| 41 |
+
MAX_ROWS = None # None => all rows
|
| 42 |
+
TOPIC_COUNT = 3
|
| 43 |
+
|
| 44 |
+
# Table teaser length to avoid massive single-cell height in PDF tables
|
| 45 |
+
TEASER_CHAR_LIMIT = 900
|
| 46 |
+
|
| 47 |
+
# ---------------- UTIL ----------------
|
| 48 |
+
RELATIVE_TIME_RE = re.compile(
|
| 49 |
+
r'(?:(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago)|\b(yesterday|today|just now|now)\b',
|
| 50 |
+
flags=re.IGNORECASE
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
try:
|
| 54 |
+
import torch
|
| 55 |
+
device = 0 if torch.cuda.is_available() else -1
|
| 56 |
+
except Exception:
|
| 57 |
+
device = -1
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
sentiment_model = pipeline("sentiment-analysis",
|
| 61 |
+
model="distilbert-base-uncased-finetuned-sst-2-english",
|
| 62 |
+
device=device)
|
| 63 |
+
except Exception as e:
|
| 64 |
+
print("Failed to load requested model:", e)
|
| 65 |
+
try:
|
| 66 |
+
sentiment_model = pipeline("sentiment-analysis", device=device)
|
| 67 |
+
except Exception as ex:
|
| 68 |
+
print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def parse_relative_time(s: str, ref: pd.Timestamp):
|
| 72 |
+
if not isinstance(s, str) or s.strip() == "":
|
| 73 |
+
return pd.NaT
|
| 74 |
+
s = s.strip().lower()
|
| 75 |
+
if s in ("just now", "now"):
|
| 76 |
+
return ref
|
| 77 |
+
if s == "today":
|
| 78 |
+
return pd.Timestamp(ref.date())
|
| 79 |
+
if s == "yesterday":
|
| 80 |
+
return ref - pd.Timedelta(days=1)
|
| 81 |
+
s = re.sub(r'\b(an|a)\b', '1', s)
|
| 82 |
+
m = re.search(r'(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago', s)
|
| 83 |
+
if not m:
|
| 84 |
+
return pd.NaT
|
| 85 |
+
qty = int(m.group(1)); unit = m.group(2).lower()
|
| 86 |
+
if unit in ("second","sec","s"): return ref - pd.Timedelta(seconds=qty)
|
| 87 |
+
if unit in ("minute","min","m"): return ref - pd.Timedelta(minutes=qty)
|
| 88 |
+
if unit in ("hour","hr","h"): return ref - pd.Timedelta(hours=qty)
|
| 89 |
+
if unit in ("day","d"): return ref - pd.Timedelta(days=qty)
|
| 90 |
+
if unit in ("week","w"): return ref - pd.Timedelta(weeks=qty)
|
| 91 |
+
if unit in ("month","mo"): return ref - pd.Timedelta(days=qty * 30)
|
| 92 |
+
if unit in ("year","yr","y"): return ref - pd.Timedelta(days=qty * 365)
|
| 93 |
+
return pd.NaT
|
| 94 |
+
|
| 95 |
+
def clean_text(text: str) -> str:
|
| 96 |
+
if not isinstance(text, str): return ""
|
| 97 |
+
text = re.sub(r"http\S+", "", text)
|
| 98 |
+
text = re.sub(r"@\w+", "", text)
|
| 99 |
+
text = re.sub(r"#\w+", "", text)
|
| 100 |
+
text = re.sub(r"[^A-Za-z\s]", " ", text)
|
| 101 |
+
text = re.sub(r"\s+", " ", text)
|
| 102 |
+
return text.lower().strip()
|
| 103 |
+
|
| 104 |
+
def chunked(iterable, size):
|
| 105 |
+
for i in range(0, len(iterable), size):
|
| 106 |
+
yield iterable[i:i+size]
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def teaser(s, n=TEASER_CHAR_LIMIT):
|
| 110 |
+
if not isinstance(s, str): return ""
|
| 111 |
+
s = s.strip()
|
| 112 |
+
return (s if len(s) <= n else s[:n-1].rsplit(" ",1)[0] + " ...")
|
| 113 |
+
|
| 114 |
+
def parse_score(x):
|
| 115 |
+
if pd.isna(x): return np.nan
|
| 116 |
+
s = str(x)
|
| 117 |
+
m = re.search(r"(-?\d+)", s.replace(",", ""))
|
| 118 |
+
if m: return int(m.group(1))
|
| 119 |
+
nums = re.findall(r"\d+", s)
|
| 120 |
+
return int(nums[0]) if nums else np.nan
|
| 121 |
+
|
| 122 |
+
def parse_time_value(v,ref_ts):
|
| 123 |
+
if isinstance(v, (pd.Timestamp, datetime)): return pd.to_datetime(v)
|
| 124 |
+
if pd.isna(v): return pd.NaT
|
| 125 |
+
s = str(v).strip()
|
| 126 |
+
try:
|
| 127 |
+
parsed = pd.to_datetime(s, errors='coerce', utc=None)
|
| 128 |
+
if pd.notna(parsed): return parsed
|
| 129 |
+
except Exception: pass
|
| 130 |
+
rt = parse_relative_time(s, ref_ts)
|
| 131 |
+
if pd.notna(rt): return pd.to_datetime(rt)
|
| 132 |
+
return pd.NaT
|
| 133 |
+
|
| 134 |
+
def compile_list(lst): return [re.compile(pat, flags=re.IGNORECASE) for pat in lst]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# ---------------- India-specific nature detection ----------------
|
| 138 |
+
PRO_INDIA = [r"\bjai hind\b", r"\bvande mataram\b", r"\bpro india\b", r"\bpro-india\b", r"\bsupport (?:india|modi|bjp)\b", r"\bproud of india\b", r"\bindia is great\b"]
|
| 139 |
+
ANTI_INDIA = [r"\banti[- ]?india\b", r"\banti national\b", r"\btraitor\b", r"\banti-india\b", r"\bkill india\b", r"\bboycott india\b"]
|
| 140 |
+
CRITICAL_GOVT = [r"\bmodi sucks\b", r"\bcorrupt government\b", r"\bgovernment (?:is )?failing\b", r"\b(criticis|criticize|criticising) (?:government|modi|bjp)\b", r"\bpolicy (?:failure|fail)\b", r"\banti-corruption\b", r"\bmisgovern(ance|ing)\b", r"\bgovernment (?:policy|policies)"]
|
| 141 |
+
SUPPORT_OPPOSITION = [r"\bsupport (?:congress|aam aadmi|aap|opposition)\b", r"\bvot(e|ing) for .*opposition\b"]
|
| 142 |
+
SEPARATIST = [r"\bazadi\b", r"\bseparatist\b", r"\bsecede\b", r"\bindependence for\b"]
|
| 143 |
+
COMMUNAL = [r"\bcommunal\b", r"\breligious (?:tension|hatred)\b", r"\breligious\b", r"\bminority\b"]
|
| 144 |
+
CALL_TO_ACTION = [r"\bprotest\b", r"\bboycott\b", r"\bjoin (?:the )?protest\b", r"\bstrike\b", r"\brally\b", r"\baction\b"]
|
| 145 |
+
CONSPIRACY = [r"\bforeign funded\b", r"\bdeep state\b", r"\bconspiracy\b", r"\bwestern plot\b", r"\bcia\b", r"\bsecret agenda\b"]
|
| 146 |
+
|
| 147 |
+
PRO_INDIA_RE = compile_list(PRO_INDIA); ANTI_INDIA_RE = compile_list(ANTI_INDIA)
|
| 148 |
+
CRITICAL_GOVT_RE = compile_list(CRITICAL_GOVT); SUPPORT_OPPOSITION_RE = compile_list(SUPPORT_OPPOSITION)
|
| 149 |
+
SEPARATIST_RE = compile_list(SEPARATIST); COMMUNAL_RE = compile_list(COMMUNAL)
|
| 150 |
+
CALL_TO_ACTION_RE = compile_list(CALL_TO_ACTION); CONSPIRACY_RE = compile_list(CONSPIRACY)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def text_matches_any(text, patterns):
|
| 154 |
+
for pat in patterns:
|
| 155 |
+
if pat.search(text or ""): return True
|
| 156 |
+
return False
|
| 157 |
+
|
| 158 |
+
def determine_nature(text, sentiment_label):
|
| 159 |
+
t = (text or "").lower()
|
| 160 |
+
if text_matches_any(t, SEPARATIST_RE): return "separatist"
|
| 161 |
+
if text_matches_any(t, ANTI_INDIA_RE): return "anti-india"
|
| 162 |
+
if text_matches_any(t, PRO_INDIA_RE): return "pro-india"
|
| 163 |
+
if text_matches_any(t, CALL_TO_ACTION_RE): return "call-to-action"
|
| 164 |
+
if text_matches_any(t, COMMUNAL_RE): return "communal"
|
| 165 |
+
if text_matches_any(t, CONSPIRACY_RE): return "conspiratorial"
|
| 166 |
+
if text_matches_any(t, CRITICAL_GOVT_RE): return "critical-of-government"
|
| 167 |
+
if text_matches_any(t, SUPPORT_OPPOSITION_RE): return "supportive-of-opposition"
|
| 168 |
+
s = str(sentiment_label).upper()
|
| 169 |
+
if "POS" in s: return "supportive"
|
| 170 |
+
if "NEG" in s: return "critical"
|
| 171 |
+
return "neutral"
|
| 172 |
+
|
| 173 |
+
# ---------------- DANGEROUS FLAG ----------------
|
| 174 |
+
danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant","insurgency","boycott","protest","call to action"]
|
| 175 |
+
pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b', flags=re.IGNORECASE)
|
| 176 |
+
|
| 177 |
+
def is_dangerous(text, sentiment):
|
| 178 |
+
if pattern.search(text or ""): return True
|
| 179 |
+
return (str(sentiment).upper() == "NEGATIVE" and text.strip() != "")
|
| 180 |
+
|
| 181 |
+
def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
|
| 182 |
+
"""
|
| 183 |
+
Runs full analysis pipeline. Returns dict: {'pdf':..., 'csv':..., 'docx':...}
|
| 184 |
+
"""
|
| 185 |
+
logger.info("Running processing pipeline on %s",input_csv)
|
| 186 |
+
out_dir= Path(out_dir)
|
| 187 |
+
out_dir.mkdir(parents=True,exist_ok=True)
|
| 188 |
+
|
| 189 |
+
# ---------------- READ CSV ----------------
|
| 190 |
+
if not os.path.exists(input_csv):
|
| 191 |
+
print("CSV file not found:", input_csv); sys.exit(1)
|
| 192 |
+
|
| 193 |
+
print("Loading CSV:", input_csv)
|
| 194 |
+
try:
|
| 195 |
+
df_raw = pd.read_csv(input_csv, encoding=CSV_ENCODING, low_memory=False)
|
| 196 |
+
except Exception as e:
|
| 197 |
+
print("Error reading CSV:", e); sys.exit(1)
|
| 198 |
+
|
| 199 |
+
if MAX_ROWS:
|
| 200 |
+
df_raw = df_raw.head(MAX_ROWS)
|
| 201 |
+
|
| 202 |
+
title_col = "Title"
|
| 203 |
+
reference_col = "Reference"
|
| 204 |
+
subreddit_col = "Subreddit"
|
| 205 |
+
score_col = "Score"
|
| 206 |
+
comment_col = "Comments"
|
| 207 |
+
time_col = "Time"
|
| 208 |
+
author_col = "Author"
|
| 209 |
+
desc_col = "Description"
|
| 210 |
+
url_col = "Url"
|
| 211 |
+
|
| 212 |
+
if not any(c in df_raw.columns for c in [title_col, comment_col, desc_col]):
|
| 213 |
+
print("No text column detected. CSV columns:", list(df_raw.columns)); sys.exit(1)
|
| 214 |
+
|
| 215 |
+
# if title is None(not provided) entire column is filled with "" strings
|
| 216 |
+
# if title is provided but for some it is NaN after astype(str) they become "nan" not empty string
|
| 217 |
+
# normalized df
|
| 218 |
+
df = pd.DataFrame()
|
| 219 |
+
df["orig_index"] = df_raw.index.astype(str)
|
| 220 |
+
df["title"] = df_raw[title_col].fillna("").astype(str) if title_col else ""
|
| 221 |
+
df["reference"] = df_raw[reference_col].astype(str) if reference_col else ""
|
| 222 |
+
df["subreddit"] = df_raw[subreddit_col] if subreddit_col else "N/A"
|
| 223 |
+
df["raw_score"] = df_raw[score_col] if score_col else np.nan
|
| 224 |
+
df["comment"] = df_raw[comment_col].fillna("").astype(str) if comment_col else ""
|
| 225 |
+
df["time_raw"] = df_raw[time_col] if time_col else ""
|
| 226 |
+
df["username"] = df_raw[author_col] if author_col else "N/A"
|
| 227 |
+
df["description"] = df_raw[desc_col].fillna("").astype(str) if desc_col else ""
|
| 228 |
+
df["url"] = df_raw[url_col] if url_col else ""
|
| 229 |
+
|
| 230 |
+
df["text_for_analysis"] = (df["title"] + " " + df["comment"] + " " + df["description"]).str.strip()
|
| 231 |
+
df.loc[df["text_for_analysis"].str.strip() == "", "text_for_analysis"] = df.loc[df["text_for_analysis"].str.strip() == "", :].apply(
|
| 232 |
+
lambda r: " ".join([str(v) for v in r.values if isinstance(v, str) and v.strip() != ""]), axis=1
|
| 233 |
+
)
|
| 234 |
+
df["clean_text"] = df["text_for_analysis"].apply(clean_text)
|
| 235 |
+
df["score"] = df["raw_score"].apply(parse_score)
|
| 236 |
+
|
| 237 |
+
# parse times
|
| 238 |
+
try:
|
| 239 |
+
ref_ts = pd.to_datetime(os.path.getmtime(input_csv), unit='s')
|
| 240 |
+
except Exception:
|
| 241 |
+
ref_ts = pd.Timestamp.now()
|
| 242 |
+
|
| 243 |
+
df["created_at"] = df["time_raw"].apply(lambda x: parse_time_value(x,ref_ts))
|
| 244 |
+
|
| 245 |
+
# ---------------- SENTIMENT ----------------
|
| 246 |
+
print("Loading sentiment model...")
|
| 247 |
+
|
| 248 |
+
texts = df["clean_text"].tolist()
|
| 249 |
+
preds = []
|
| 250 |
+
batch_size = 32
|
| 251 |
+
for batch in chunked(texts, batch_size):
|
| 252 |
+
out = sentiment_model(batch, truncation=True)
|
| 253 |
+
for o in out:
|
| 254 |
+
label = o.get("label", "NEUTRAL")
|
| 255 |
+
score = float(o.get("score", 0.0))
|
| 256 |
+
preds.append((label, score))
|
| 257 |
+
|
| 258 |
+
df["sentiment"] = [p[0] for p in preds]
|
| 259 |
+
df["sentiment_score"] = [p[1] for p in preds]
|
| 260 |
+
# df["nature"] = df.apply(lambda r: determine_nature(r["clean_text"], r["sentiment"]), axis=1)
|
| 261 |
+
df["nature"] = [
|
| 262 |
+
determine_nature(text, sentiment)
|
| 263 |
+
for text, sentiment in zip(df["clean_text"], df["sentiment"])
|
| 264 |
+
]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# ---------------- TOPIC MODELING ----------------
|
| 268 |
+
print("Performing topic modeling...")
|
| 269 |
+
|
| 270 |
+
vectorizer = CountVectorizer(stop_words="english", min_df=2)
|
| 271 |
+
try:
|
| 272 |
+
X = vectorizer.fit_transform(df["clean_text"])
|
| 273 |
+
except Exception as e:
|
| 274 |
+
print("Topic vectorization failed:", e); X = None
|
| 275 |
+
|
| 276 |
+
if X is None or X.shape[0] < 3 or len(vectorizer.get_feature_names_out()) < 5:
|
| 277 |
+
df["topic"] = np.nan
|
| 278 |
+
topic_counts = pd.Series(dtype=int)
|
| 279 |
+
else:
|
| 280 |
+
n_topics = min(TOPIC_COUNT, X.shape[0])
|
| 281 |
+
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
|
| 282 |
+
lda.fit(X)
|
| 283 |
+
doc_topic = lda.transform(X)
|
| 284 |
+
df["topic"] = doc_topic.argmax(axis=1)
|
| 285 |
+
topic_counts = df["topic"].value_counts().sort_index()
|
| 286 |
+
|
| 287 |
+
df["dangerous"] = df.apply(lambda r: is_dangerous(r["clean_text"], r["sentiment"]), axis=1)
|
| 288 |
+
dangerous_tweets = df[df["dangerous"]].copy()
|
| 289 |
+
print(f"Flagged {len(dangerous_tweets)} potentially dangerous posts.")
|
| 290 |
+
|
| 291 |
+
# ---------------- VISUALS ----------------
|
| 292 |
+
try:
|
| 293 |
+
# sentiment plot
|
| 294 |
+
sent_counts = df["sentiment"].value_counts()
|
| 295 |
+
plt.figure(figsize=(6,4))
|
| 296 |
+
sent_counts.plot(kind="bar")
|
| 297 |
+
plt.title("Sentiment Distribution")
|
| 298 |
+
plt.tight_layout()
|
| 299 |
+
plt.savefig(out_dir / "sentiment.png", dpi=150)
|
| 300 |
+
plt.close()
|
| 301 |
+
# topic plot
|
| 302 |
+
if "topic" in df and df["topic"].notna().any():
|
| 303 |
+
topic_counts = df["topic"].value_counts().sort_index()
|
| 304 |
+
plt.figure(figsize=(6,4))
|
| 305 |
+
topic_counts.plot(kind="bar")
|
| 306 |
+
plt.title("Topic Distribution")
|
| 307 |
+
plt.tight_layout()
|
| 308 |
+
plt.savefig(out_dir / "topics.png", dpi=150)
|
| 309 |
+
plt.close()
|
| 310 |
+
# danger wordcloud
|
| 311 |
+
dangerous_df = df[df["dangerous"]]
|
| 312 |
+
if not dangerous_df.empty:
|
| 313 |
+
wc_text = " ".join(dangerous_df["clean_text"].tolist())
|
| 314 |
+
wc = WordCloud(width=1000, height=400, background_color="white", stopwords=set(STOPWORDS)).generate(wc_text)
|
| 315 |
+
plt.figure(figsize=(12,5))
|
| 316 |
+
plt.imshow(wc, interpolation="bilinear")
|
| 317 |
+
plt.axis("off")
|
| 318 |
+
plt.tight_layout()
|
| 319 |
+
plt.savefig(out_dir / "danger_wc.png", dpi=150)
|
| 320 |
+
plt.close()
|
| 321 |
+
except Exception as e:
|
| 322 |
+
logger.warning("Visuals generation failed: %s", e)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
# ---------------- BUILD PDF ----------------
|
| 326 |
+
print("Building PDF report (LongTable for large tables)...")
|
| 327 |
+
pdf_out= out_dir/"report.pdf"
|
| 328 |
+
styles = getSampleStyleSheet()
|
| 329 |
+
styleN = styles["Normal"]
|
| 330 |
+
styleH = styles["Heading2"]
|
| 331 |
+
title_style = styles["Title"]
|
| 332 |
+
tweet_paragraph_style = ParagraphStyle("TweetStyle", parent=styles["BodyText"], fontSize=9, leading=11, spaceAfter=6, alignment=TA_LEFT)
|
| 333 |
+
|
| 334 |
+
doc = SimpleDocTemplate(pdf_out, pagesize=A4, rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36)
|
| 335 |
+
elements = []
|
| 336 |
+
elements.append(Paragraph("Reddit Posts Report (CSV Source) — India-specific Nature", title_style))
|
| 337 |
+
elements.append(Spacer(1, 8))
|
| 338 |
+
elements.append(Paragraph(f"Total Posts Processed: {len(df)}", styleN))
|
| 339 |
+
elements.append(Spacer(1, 8))
|
| 340 |
+
|
| 341 |
+
# Sentiment summary
|
| 342 |
+
elements.append(Paragraph("Sentiment Analysis Summary", styleH))
|
| 343 |
+
total = len(df)
|
| 344 |
+
for label, count in sent_counts.items():
|
| 345 |
+
pct = count / total * 100 if total > 0 else 0
|
| 346 |
+
elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
|
| 347 |
+
elements.append(Spacer(1, 6))
|
| 348 |
+
if os.path.exists("sentiment.png"):
|
| 349 |
+
elements.append(Image("sentiment.png", width=5.5*inch, height=3*inch))
|
| 350 |
+
elements.append(Spacer(1, 12))
|
| 351 |
+
|
| 352 |
+
# Topic & Nature summary
|
| 353 |
+
if not topic_counts.empty:
|
| 354 |
+
elements.append(Paragraph("Topic Modeling Summary", styleH))
|
| 355 |
+
for idx, val in topic_counts.items():
|
| 356 |
+
elements.append(Paragraph(f"Topic {int(idx)}: {int(val)} posts", styleN))
|
| 357 |
+
elements.append(Spacer(1, 6))
|
| 358 |
+
if os.path.exists("topics.png"): elements.append(Image("topics.png", width=5.5*inch, height=3*inch))
|
| 359 |
+
elements.append(Spacer(1, 12))
|
| 360 |
+
|
| 361 |
+
elements.append(Paragraph("Nature (India-specific) Summary", styleH))
|
| 362 |
+
nature_counts = df["nature"].value_counts()
|
| 363 |
+
for label, count in nature_counts.items():
|
| 364 |
+
pct = count / total * 100 if total > 0 else 0
|
| 365 |
+
elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
|
| 366 |
+
elements.append(Spacer(1, 12))
|
| 367 |
+
|
| 368 |
+
# Dangerous posts table (LongTable)
|
| 369 |
+
elements.append(Paragraph("Flagged Potentially Dangerous Posts", styleH))
|
| 370 |
+
elements.append(Spacer(1, 6))
|
| 371 |
+
if dangerous_tweets.empty:
|
| 372 |
+
elements.append(Paragraph("No dangerous posts detected.", styleN))
|
| 373 |
+
else:
|
| 374 |
+
# prepare LongTable data (header + rows)
|
| 375 |
+
header = ["Post (teaser)", "Subreddit", "Author", "Sentiment", "Nature", "Topic", "Date"]
|
| 376 |
+
lt_data = [header]
|
| 377 |
+
for _, row in dangerous_tweets.iterrows():
|
| 378 |
+
date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
|
| 379 |
+
lt_data.append([
|
| 380 |
+
Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style),
|
| 381 |
+
row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
|
| 382 |
+
row["username"] if pd.notna(row["username"]) else "N/A",
|
| 383 |
+
row["sentiment"],
|
| 384 |
+
row["nature"],
|
| 385 |
+
str(int(row["topic"])) if not pd.isna(row["topic"]) else "N/A",
|
| 386 |
+
date_str
|
| 387 |
+
])
|
| 388 |
+
col_widths = [3.0*inch, 0.7*inch, 0.8*inch, 0.6*inch, 0.8*inch, 0.5*inch, 1.0*inch]
|
| 389 |
+
lt = LongTable(lt_data, colWidths=col_widths, repeatRows=1)
|
| 390 |
+
# style: small font, grid, header background
|
| 391 |
+
lt_style = TableStyle([
|
| 392 |
+
('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
|
| 393 |
+
('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
|
| 394 |
+
('ALIGN', (1,0), (-1,-1), 'CENTER'),
|
| 395 |
+
('VALIGN', (0,0), (-1,-1), 'TOP'),
|
| 396 |
+
('GRID', (0,0), (-1,-1), 0.25, colors.grey),
|
| 397 |
+
('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
|
| 398 |
+
('FONTSIZE', (0,0), (-1,-1), 8),
|
| 399 |
+
('LEFTPADDING', (0,0), (-1,-1), 4),
|
| 400 |
+
('RIGHTPADDING', (0,0), (-1,-1), 4),
|
| 401 |
+
])
|
| 402 |
+
lt.setStyle(lt_style)
|
| 403 |
+
elements.append(lt)
|
| 404 |
+
elements.append(Spacer(1, 12))
|
| 405 |
+
if os.path.exists("danger_wc.png"):
|
| 406 |
+
elements.append(Paragraph("Word Cloud of Flagged Posts", styleH)); elements.append(Image("danger_wc.png", width=5.5*inch, height=2.6*inch))
|
| 407 |
+
|
| 408 |
+
elements.append(PageBreak())
|
| 409 |
+
|
| 410 |
+
# All collected posts (LongTable) - include full dataset but use teaser to avoid huge cells
|
| 411 |
+
elements.append(Paragraph("All Collected Posts", styles['Heading2']))
|
| 412 |
+
all_header = ["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]
|
| 413 |
+
all_lt_data = [all_header]
|
| 414 |
+
for idx, row in df.iterrows():
|
| 415 |
+
date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
|
| 416 |
+
all_lt_data.append([
|
| 417 |
+
date_str,
|
| 418 |
+
row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
|
| 419 |
+
row["username"] if pd.notna(row["username"]) else "N/A",
|
| 420 |
+
str(row["score"]) if not pd.isna(row["score"]) else "N/A",
|
| 421 |
+
row["nature"],
|
| 422 |
+
Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style)
|
| 423 |
+
])
|
| 424 |
+
|
| 425 |
+
all_col_widths = [1.0*inch, 1.0*inch, 1.0*inch, 0.7*inch, 0.9*inch, 2.8*inch]
|
| 426 |
+
all_lt = LongTable(all_lt_data, colWidths=all_col_widths, repeatRows=1)
|
| 427 |
+
all_lt.setStyle(TableStyle([
|
| 428 |
+
('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
|
| 429 |
+
('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
|
| 430 |
+
('GRID', (0,0), (-1,-1), 0.25, colors.grey),
|
| 431 |
+
('VALIGN', (0,0), (-1,-1), 'TOP'),
|
| 432 |
+
('FONTSIZE', (0,0), (-1,-1), 8),
|
| 433 |
+
('LEFTPADDING', (0,0), (-1,-1), 4),
|
| 434 |
+
('RIGHTPADDING', (0,0), (-1,-1), 4),
|
| 435 |
+
]))
|
| 436 |
+
elements.append(all_lt)
|
| 437 |
+
|
| 438 |
+
# finish PDF
|
| 439 |
+
doc = SimpleDocTemplate(str(pdf_out))
|
| 440 |
+
doc.build(elements)
|
| 441 |
+
print("✅ PDF saved as:", pdf_out)
|
| 442 |
+
|
| 443 |
+
# ---------------- SAVE CSV (full enriched) ----------------
|
| 444 |
+
csv_out = out_dir/"analysis_output.csv"
|
| 445 |
+
df_out = df.copy()
|
| 446 |
+
df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
|
| 447 |
+
df_out.to_csv(csv_out, index=False, encoding="utf-8")
|
| 448 |
+
print("✅ Enriched CSV saved as:", csv_out)
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
# ---------------- DOCX EXPORT (optional) ----------------
|
| 452 |
+
if not DOCX_AVAILABLE:
|
| 453 |
+
print("python-docx not installed — skipping DOCX export. Install via: pip install python-docx")
|
| 454 |
+
else:
|
| 455 |
+
try:
|
| 456 |
+
print("Building DOCX report...")
|
| 457 |
+
DOCX_OUTPUT= out_dir/"report.docx"
|
| 458 |
+
docx = Document()
|
| 459 |
+
docx.add_heading("Reddit Posts Report (India-specific Nature)", level=1)
|
| 460 |
+
docx.add_paragraph(f"Total Posts Processed: {len(df)}")
|
| 461 |
+
docx.add_heading("Sentiment Analysis Summary", level=2)
|
| 462 |
+
for label, count in sent_counts.items():
|
| 463 |
+
pct = count / total * 100 if total > 0 else 0
|
| 464 |
+
docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
|
| 465 |
+
|
| 466 |
+
docx.add_heading("Nature Summary", level=2)
|
| 467 |
+
for label, count in nature_counts.items():
|
| 468 |
+
pct = count / total * 100 if total > 0 else 0
|
| 469 |
+
docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
|
| 470 |
+
|
| 471 |
+
# add small sample table (first 200 rows or less)
|
| 472 |
+
sample_n = min(200, len(df))
|
| 473 |
+
docx.add_heading(f"Sample of First {sample_n} Posts", level=2)
|
| 474 |
+
table = docx.add_table(rows=1, cols=6)
|
| 475 |
+
hdr_cells = table.rows[0].cells
|
| 476 |
+
hdr_cells[0].text = "Date"
|
| 477 |
+
hdr_cells[1].text = "Subreddit"
|
| 478 |
+
hdr_cells[2].text = "Author"
|
| 479 |
+
hdr_cells[3].text = "Score"
|
| 480 |
+
hdr_cells[4].text = "Nature"
|
| 481 |
+
hdr_cells[5].text = "Post (teaser)"
|
| 482 |
+
for idx, row in df.head(sample_n).iterrows():
|
| 483 |
+
row_cells = table.add_row().cells
|
| 484 |
+
date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
|
| 485 |
+
row_cells[0].text = date_str
|
| 486 |
+
row_cells[1].text = str(row["subreddit"]) if pd.notna(row["subreddit"]) else "N/A"
|
| 487 |
+
row_cells[2].text = str(row["username"]) if pd.notna(row["username"]) else "N/A"
|
| 488 |
+
row_cells[3].text = str(row["score"]) if not pd.isna(row["score"]) else "N/A"
|
| 489 |
+
row_cells[4].text = str(row["nature"])
|
| 490 |
+
row_cells[5].text = teaser(row["text_for_analysis"], 300)
|
| 491 |
+
|
| 492 |
+
docx.save(DOCX_OUTPUT)
|
| 493 |
+
print("✅ DOCX saved as:", DOCX_OUTPUT)
|
| 494 |
+
except Exception as e:
|
| 495 |
+
logger.exception("DOCX creation failed: %s", e)
|
| 496 |
+
if DOCX_OUTPUT.exists():
|
| 497 |
+
try:
|
| 498 |
+
DOCX_OUTPUT.unlink(missing_ok=True)
|
| 499 |
+
except Exception:
|
| 500 |
+
pass
|
| 501 |
+
logger.info("Processor: finished, files at %s", out_dir)
|
| 502 |
+
return {"pdf": str(pdf_out), "csv": str(csv_out), "docx": str(DOCX_OUTPUT) if DOCX_OUTPUT.exists() else ""}
|
| 503 |
+
|
reddit_scrapper.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import csv
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from typing import Iterable, List, Optional
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
import praw
|
| 11 |
+
import prawcore
|
| 12 |
+
import pytz
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger("reddit_scraper")
|
| 15 |
+
logger.setLevel(logging.INFO)
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
# default queries (copied from your Selenium version)
|
| 20 |
+
political_queries: List[str] = [
|
| 21 |
+
"india politics",
|
| 22 |
+
"india protest",
|
| 23 |
+
"india government fail",
|
| 24 |
+
"india corruption",
|
| 25 |
+
"india democracy threat",
|
| 26 |
+
"india dictatorship",
|
| 27 |
+
"india religious violence",
|
| 28 |
+
"india communal riots",
|
| 29 |
+
"india anti muslim",
|
| 30 |
+
"india anti sikh",
|
| 31 |
+
"india caste violence",
|
| 32 |
+
"india hate speech",
|
| 33 |
+
"india freedom struggle",
|
| 34 |
+
"india human rights violation",
|
| 35 |
+
"india farmers protest",
|
| 36 |
+
"india caa protest",
|
| 37 |
+
"india nrc protest",
|
| 38 |
+
"india modi resign",
|
| 39 |
+
"india bjp fail",
|
| 40 |
+
"india rss agenda",
|
| 41 |
+
"india fake news",
|
| 42 |
+
"india propaganda",
|
| 43 |
+
"india media blackout",
|
| 44 |
+
"boycott india",
|
| 45 |
+
"boycott indian products",
|
| 46 |
+
"boycott bollywood",
|
| 47 |
+
"kashmir freedom",
|
| 48 |
+
"kashmir human rights",
|
| 49 |
+
"kashmir india occupation",
|
| 50 |
+
"kashmir protest",
|
| 51 |
+
"khalistan movement",
|
| 52 |
+
"punjab separatism",
|
| 53 |
+
"anti national india",
|
| 54 |
+
"down with india",
|
| 55 |
+
"stop india aggression",
|
| 56 |
+
"india pakistan conflict",
|
| 57 |
+
"china india border",
|
| 58 |
+
"india brutality",
|
| 59 |
+
"india minority oppression"
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
def _init_reddit():
|
| 63 |
+
"""Initialize a PRAW Reddit instance using environment variables."""
|
| 64 |
+
client_id = os.environ.get("REDDIT_CLIENT_ID")
|
| 65 |
+
client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
|
| 66 |
+
user_agent = os.environ.get("REDDIT_USER_AGENT", "reddit_scraper:v1.0")
|
| 67 |
+
|
| 68 |
+
logger.info(f"Initializing Reddit with ClientID: {client_id}, Agent: {user_agent}")
|
| 69 |
+
|
| 70 |
+
if not client_id or not client_secret:
|
| 71 |
+
logger.error("Missing REDDIT_CLIENT_ID or REDDIT_CLIENT_SECRET env vars")
|
| 72 |
+
raise EnvironmentError(
|
| 73 |
+
"REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set as environment variables."
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
return praw.Reddit(
|
| 77 |
+
client_id=client_id,
|
| 78 |
+
client_secret=client_secret,
|
| 79 |
+
user_agent=user_agent,
|
| 80 |
+
check_for_async=False # prevents accidental async loop issues
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
def _format_time(created_utc: Optional[float]) -> str:
|
| 84 |
+
"""Return timestamp string in UTC 'YYYY-MM-DD HH:MM:SS' (fallback 'N/A')."""
|
| 85 |
+
if not created_utc:
|
| 86 |
+
return "N/A"
|
| 87 |
+
# use UTC time for consistency
|
| 88 |
+
dt = datetime.fromtimestamp(created_utc, tz=timezone.utc)
|
| 89 |
+
return dt.strftime("%Y-%m-%d %H:%M:%S")
|
| 90 |
+
|
| 91 |
+
def scrape_reddit_to_csv(
|
| 92 |
+
output_csv_path: str,
|
| 93 |
+
per_query_limit: int,
|
| 94 |
+
total_limit: int,
|
| 95 |
+
delay_between_queries: float = 1.5
|
| 96 |
+
) -> int:
|
| 97 |
+
"""
|
| 98 |
+
Scrape reddit using PRAW and save results to output_csv_path.
|
| 99 |
+
- per_query_limit: max results to request per query (PRAW will respect rate limits)
|
| 100 |
+
- total_limit: overall cap on number of rows written
|
| 101 |
+
- returns: number of rows written
|
| 102 |
+
"""
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
reddit = _init_reddit()
|
| 106 |
+
logger.info(f"Reddit instance created. Read-only: {reddit.read_only}")
|
| 107 |
+
except Exception as e:
|
| 108 |
+
logger.exception(f"Failed to init reddit: {e}")
|
| 109 |
+
raise
|
| 110 |
+
|
| 111 |
+
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
|
| 112 |
+
logger.info("Running PRAW scraper and saving CSV to %s", output_csv_path)
|
| 113 |
+
|
| 114 |
+
written = 0
|
| 115 |
+
seen_ids = set()
|
| 116 |
+
|
| 117 |
+
header = ["Title", "Reference", "Score", "Comments", "Time", "Author", "Subreddit", "Description", "Url"]
|
| 118 |
+
|
| 119 |
+
with open(output_csv_path, "w", newline="", encoding="utf-8") as fh:
|
| 120 |
+
writer = csv.writer(fh)
|
| 121 |
+
writer.writerow(header)
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
for query in political_queries:
|
| 125 |
+
if written >= total_limit:
|
| 126 |
+
logger.info("Reached total_limit=%s, stopping.", total_limit)
|
| 127 |
+
break
|
| 128 |
+
|
| 129 |
+
logger.info("Searching Reddit for query: %s (limit=%s)", query, per_query_limit)
|
| 130 |
+
try:
|
| 131 |
+
# search on r/all
|
| 132 |
+
submissions = reddit.subreddit("all").search(query, sort="new", limit=per_query_limit)
|
| 133 |
+
# Force a generator fetch to check for immediate auth errors
|
| 134 |
+
# submissions = list(submissions)
|
| 135 |
+
except prawcore.exceptions.RequestException as e:
|
| 136 |
+
logger.warning("Network error during PRAW search for '%s': %s", query, e)
|
| 137 |
+
time.sleep(2)
|
| 138 |
+
continue
|
| 139 |
+
except Exception as e:
|
| 140 |
+
logger.exception("PRAW search failed for '%s': %s", query, e)
|
| 141 |
+
time.sleep(2)
|
| 142 |
+
continue
|
| 143 |
+
|
| 144 |
+
keywords = [kw.lower() for kw in query.split() if kw.strip()]
|
| 145 |
+
|
| 146 |
+
for sub in submissions:
|
| 147 |
+
if written >= total_limit:
|
| 148 |
+
break
|
| 149 |
+
|
| 150 |
+
try:
|
| 151 |
+
sid = getattr(sub, "id", None)
|
| 152 |
+
if not sid:
|
| 153 |
+
continue
|
| 154 |
+
if sid in seen_ids:
|
| 155 |
+
continue
|
| 156 |
+
seen_ids.add(sid)
|
| 157 |
+
|
| 158 |
+
title = getattr(sub, "title", "") or ""
|
| 159 |
+
reference = sid
|
| 160 |
+
score = getattr(sub, "score", 0) or 0
|
| 161 |
+
comments = getattr(sub, "num_comments", 0) or 0
|
| 162 |
+
created = _format_time(getattr(sub, "created_utc", None))
|
| 163 |
+
author = getattr(sub.author, "name", "deleted") if getattr(sub, "author", None) else "deleted"
|
| 164 |
+
subreddit = getattr(sub.subreddit, "display_name", "") or ""
|
| 165 |
+
description = getattr(sub, "selftext", "") or ""
|
| 166 |
+
url = getattr(sub, "url", "") or ""
|
| 167 |
+
|
| 168 |
+
# replicate the original filtering: ensure query keywords appear in title or description
|
| 169 |
+
text_for_check = f"{title} {description}".lower()
|
| 170 |
+
if keywords and not any(kw in text_for_check for kw in keywords):
|
| 171 |
+
# skip items that don't appear relevant
|
| 172 |
+
continue
|
| 173 |
+
|
| 174 |
+
writer.writerow([title, reference, score, comments, created, author, subreddit, description, url])
|
| 175 |
+
written += 1
|
| 176 |
+
|
| 177 |
+
except Exception as e:
|
| 178 |
+
# don't stop the whole scraper for one failing submission
|
| 179 |
+
logger.exception("Failed to process submission %s: %s", getattr(sub, "id", "<no-id>"), e)
|
| 180 |
+
continue
|
| 181 |
+
|
| 182 |
+
# respectful delay between queries to reduce risk of rate limiting
|
| 183 |
+
time.sleep(delay_between_queries)
|
| 184 |
+
|
| 185 |
+
except KeyboardInterrupt:
|
| 186 |
+
logger.warning("Scraper interrupted by user.")
|
| 187 |
+
except Exception as e:
|
| 188 |
+
logger.exception("Unhandled exception during scraping: %s", e)
|
| 189 |
+
|
| 190 |
+
logger.info("Scraper finished: wrote %d rows to %s", written, output_csv_path)
|
| 191 |
+
return written
|
| 192 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
|
| 4 |
+
pandas
|
| 5 |
+
numpy
|
| 6 |
+
scikit-learn
|
| 7 |
+
|
| 8 |
+
matplotlib
|
| 9 |
+
wordcloud
|
| 10 |
+
reportlab
|
| 11 |
+
python-docx
|
| 12 |
+
|
| 13 |
+
praw
|
| 14 |
+
requests
|
| 15 |
+
python-dotenv
|
| 16 |
+
|
| 17 |
+
transformers
|
| 18 |
+
torch
|
| 19 |
+
tokenizers
|
| 20 |
+
|
| 21 |
+
tqdm
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# absl-py==2.3.1
|
| 39 |
+
# annotated-types==0.7.0
|
| 40 |
+
# anyio==4.10.0
|
| 41 |
+
# astunparse==1.6.3
|
| 42 |
+
# attrs==25.3.0
|
| 43 |
+
# certifi==2025.8.3
|
| 44 |
+
# cffi==1.17.1
|
| 45 |
+
# charset-normalizer==3.4.3
|
| 46 |
+
# click==8.2.1
|
| 47 |
+
# colorama==0.4.6
|
| 48 |
+
# contourpy==1.3.3
|
| 49 |
+
# cycler==0.12.1
|
| 50 |
+
# fastapi==0.116.1
|
| 51 |
+
# filelock==3.19.1
|
| 52 |
+
# flatbuffers==25.2.10
|
| 53 |
+
# fonttools==4.59.2
|
| 54 |
+
# fsspec==2025.7.0
|
| 55 |
+
# gast==0.6.0
|
| 56 |
+
# google-pasta==0.2.0
|
| 57 |
+
# grpcio==1.74.0
|
| 58 |
+
# h11==0.16.0
|
| 59 |
+
# h5py==3.14.0
|
| 60 |
+
# huggingface-hub==0.34.4
|
| 61 |
+
# idna==3.10
|
| 62 |
+
# Jinja2==3.1.4
|
| 63 |
+
# joblib==1.5.2
|
| 64 |
+
# kiwisolver==1.4.9
|
| 65 |
+
# libclang==18.1.1
|
| 66 |
+
# lxml==6.0.1
|
| 67 |
+
# Markdown==3.8.2
|
| 68 |
+
# markdown-it-py==4.0.0
|
| 69 |
+
# matplotlib==3.10.8
|
| 70 |
+
# mdurl==0.1.2
|
| 71 |
+
# ml_dtypes==0.5.3
|
| 72 |
+
# mpmath==1.3.0
|
| 73 |
+
# namex==0.1.0
|
| 74 |
+
# networkx==3.3
|
| 75 |
+
# numpy==2.3.2
|
| 76 |
+
# opt_einsum==3.4.0
|
| 77 |
+
# optree==0.17.0
|
| 78 |
+
# outcome==1.3.0.post0
|
| 79 |
+
# packaging==25.0
|
| 80 |
+
# pandas==2.3.2
|
| 81 |
+
# pillow==12.1.0
|
| 82 |
+
# praw==7.8.1
|
| 83 |
+
# prawcore==2.4.0
|
| 84 |
+
# protobuf==6.32.0
|
| 85 |
+
# pycparser==2.22
|
| 86 |
+
# pydantic==2.11.7
|
| 87 |
+
# pydantic_core==2.33.2
|
| 88 |
+
# Pygments==2.19.2
|
| 89 |
+
# pyparsing==3.2.3
|
| 90 |
+
# PySocks==1.7.1
|
| 91 |
+
# python-dateutil==2.9.0.post0
|
| 92 |
+
# python-docx==1.2.0
|
| 93 |
+
# python-dotenv==1.2.1
|
| 94 |
+
# pytz==2025.2
|
| 95 |
+
# PyYAML==6.0.2
|
| 96 |
+
# regex==2025.8.29
|
| 97 |
+
# reportlab==4.4.3
|
| 98 |
+
# requests==2.32.5
|
| 99 |
+
# rich==14.1.0
|
| 100 |
+
# safetensors==0.6.2
|
| 101 |
+
# scikit-learn==1.7.1
|
| 102 |
+
# scipy==1.16.1
|
| 103 |
+
# selenium==4.35.0
|
| 104 |
+
# setuptools==80.9.0
|
| 105 |
+
# six==1.17.0
|
| 106 |
+
# sniffio==1.3.1
|
| 107 |
+
# sortedcontainers==2.4.0
|
| 108 |
+
# starlette==0.47.3
|
| 109 |
+
# sympy==1.13.3
|
| 110 |
+
# tensorboard==2.20.0
|
| 111 |
+
# tensorboard-data-server==0.7.2
|
| 112 |
+
# termcolor==3.1.0
|
| 113 |
+
# threadpoolctl==3.6.0
|
| 114 |
+
# tokenizers==0.22.0
|
| 115 |
+
# torch==2.8.0+cpu
|
| 116 |
+
# torchaudio==2.8.0+cpu
|
| 117 |
+
# torchvision==0.23.0+cpu
|
| 118 |
+
# tqdm==4.67.1
|
| 119 |
+
# transformers==4.56.0
|
| 120 |
+
# trio==0.30.0
|
| 121 |
+
# trio-websocket==0.12.2
|
| 122 |
+
# typing-inspection==0.4.1
|
| 123 |
+
# typing_extensions==4.15.0
|
| 124 |
+
# tzdata==2025.2
|
| 125 |
+
# update-checker==0.18.0
|
| 126 |
+
# urllib3==2.5.0
|
| 127 |
+
# uvicorn==0.35.0
|
| 128 |
+
# websocket-client==1.8.0
|
| 129 |
+
# Werkzeug==3.1.3
|
| 130 |
+
# wheel==0.45.1
|
| 131 |
+
# wordcloud==1.9.4
|
| 132 |
+
# wrapt==1.17.3
|
| 133 |
+
# wsproto==1.2.0
|