WizardCoder2007 commited on
Commit
1e36e31
·
1 Parent(s): 8892a43
Files changed (6) hide show
  1. .gitignore +18 -0
  2. Dockerfile +25 -0
  3. main.py +326 -0
  4. processor.py +503 -0
  5. reddit_scrapper.py +192 -0
  6. requirements.txt +133 -0
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ .env
6
+ .env.local
7
+ .env.example
8
+ .env.debug
9
+ .env.production
10
+ .env.test
11
+ .env.development
12
+
13
+ storage/latest/
14
+ *.png
15
+ *.pdf
16
+ *.csv
17
+ *.json
18
+ *.docx
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1
4
+ ENV PYTHONUNBUFFERED=1
5
+
6
+ WORKDIR /app
7
+
8
+ # System deps (for matplotlib, reportlab, wordcloud)
9
+ RUN apt-get update && apt-get install -y \
10
+ build-essential \
11
+ gcc \
12
+ libglib2.0-0 \
13
+ libsm6 \
14
+ libxext6 \
15
+ libxrender1 \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ COPY requirements.txt .
19
+ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
20
+
21
+ COPY . .
22
+
23
+ EXPOSE 7860
24
+
25
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests,time,csv,re,json,sys,math,random,io
2
+ import uuid,shutil,logging,os
3
+ from pathlib import Path
4
+ from typing import Optional,Tuple
5
+ from datetime import datetime, timezone,timedelta
6
+
7
+ from fastapi import FastAPI, Query, HTTPException, Header, BackgroundTasks, Request, Response
8
+ from fastapi.responses import HTMLResponse, JSONResponse,StreamingResponse,FileResponse
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from starlette.concurrency import run_in_threadpool
11
+ from pydantic import BaseModel
12
+ from typing import Literal
13
+
14
+ try:
15
+ import processor
16
+ except Exception as e:
17
+ raise RuntimeError(f"Failed to import processor.py: {e}")
18
+
19
+ from reddit_scrapper import scrape_reddit_to_csv
20
+
21
+ # try import python-docx (optional)
22
+ DOCX_AVAILABLE = True
23
+ try:
24
+ from docx import Document
25
+ from docx.shared import Inches
26
+ except Exception:
27
+ DOCX_AVAILABLE = False
28
+
29
+ class RerunRequest(BaseModel):
30
+ intent: Literal["light", "medium", "deep"]
31
+
32
+ INTENT_LIMITS = {
33
+ "light": {"per_query": 20, "total": 40},
34
+ "medium": {"per_query": 50, "total": 300},
35
+ "deep": {"per_query": 100, "total": 800},
36
+ }
37
+
38
+ # ---- Configuration ----
39
+ BASE_DIR= Path(__file__).resolve().parent
40
+ STORAGE_DIR= BASE_DIR/"storage"
41
+ LATEST_DIR= STORAGE_DIR/"latest"
42
+ STORAGE_DIR.mkdir(exist_ok=True)
43
+ LATEST_DIR.mkdir(exist_ok=True)
44
+
45
+ # API key (optional) if set in env required for post/rerun
46
+ API_KEY= os.environ.get("API_KEY",None)
47
+
48
+ # logging
49
+ logging.basicConfig(level=logging.INFO)
50
+ logger= logging.getLogger("report-saver")
51
+
52
+ # FastAPI code
53
+ app= FastAPI(title="Auto Report API (CSV → PDF/DOCX)")
54
+
55
+ # CORS allow all in dev, restrict in production
56
+ origins=[
57
+ "https://ciis-indol.vercel.app",
58
+ "http://localhost:8080",
59
+ "http://127.0.0.1:8080",
60
+ "http://localhost:5173",
61
+ "http://127.0.0.1:5173",
62
+ "http://localhost:8000",
63
+ "http://127.0.0.1:8000"
64
+ ]
65
+
66
+ app.add_middleware(CORSMiddleware, allow_origins=origins,allow_credentials=True, allow_methods=["*"],allow_headers=["*"])
67
+
68
+ # Helper: safe path join inside storage using Path
69
+ def storage_path(filename:str)-> Path:
70
+ return LATEST_DIR/filename
71
+
72
+ def scrape_live_data(output_csv_path:str, per_query: int, total:int)->None:
73
+ scrape_reddit_to_csv(output_csv_path,per_query,total)
74
+
75
+ # ------------------------------
76
+ # Range-supporting file response for large files (PDF preview)
77
+ def get_range_byte_positions(range_header: str, file_size: int) -> Optional[Tuple[int, int]]:
78
+ # Example Range header: 'bytes=0-1023' or 'bytes=1024-'
79
+ if not range_header:
80
+ return None
81
+ header = range_header.strip()
82
+ if not header.startswith("bytes="):
83
+ return None
84
+ range_val = header.split("=", 1)[1]
85
+ parts = range_val.split("-")
86
+ try:
87
+ if parts[0] == "":
88
+ # suffix bytes: '-N' -> last N bytes
89
+ end = file_size - 1
90
+ start = file_size - int(parts[1])
91
+ elif parts[1] == "":
92
+ # 'start-' to end of file
93
+ start = int(parts[0])
94
+ end = file_size - 1
95
+ else:
96
+ start = int(parts[0])
97
+ end = int(parts[1])
98
+ if start < 0:
99
+ start = 0
100
+ if end >= file_size:
101
+ end = file_size - 1
102
+ if start > end:
103
+ return None
104
+ return (start, end)
105
+ except Exception:
106
+ return None
107
+
108
+ def range_stream_response(path: Path, request: Request) -> StreamingResponse:
109
+ """Return a StreamingResponse that honors Range requests for a file."""
110
+ file_size = path.stat().st_size
111
+ range_header = request.headers.get("range")
112
+ range_pos = get_range_byte_positions(range_header, file_size)
113
+ headers = {
114
+ "Accept-Ranges": "bytes",
115
+ "Content-Type": "application/octet-stream",
116
+ "Content-Disposition": f'inline; filename="{path.name}"',
117
+ }
118
+
119
+ if range_pos is None:
120
+ # full content
121
+ def iterfile():
122
+ with open(path, "rb") as f:
123
+ while True:
124
+ chunk = f.read(1024 * 1024)
125
+ if not chunk:
126
+ break
127
+ yield chunk
128
+ headers["Content-Length"] = str(file_size)
129
+ return StreamingResponse(iterfile(), status_code=200, headers=headers)
130
+ else:
131
+ start, end = range_pos
132
+ length = end - start + 1
133
+ headers["Content-Length"] = str(length)
134
+ headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
135
+ # status 206 Partial Content
136
+ def iterfile_range():
137
+ with open(path, "rb") as f:
138
+ f.seek(start)
139
+ remaining = length
140
+ chunk_size = 1024 * 1024
141
+ while remaining > 0:
142
+ to_read = min(chunk_size, remaining)
143
+ chunk = f.read(to_read)
144
+ if not chunk:
145
+ break
146
+ remaining -= len(chunk)
147
+ yield chunk
148
+ return StreamingResponse(iterfile_range(), status_code=206, headers=headers)
149
+
150
+
151
+ @app.get("/")
152
+ def home():
153
+ return {"message":"sever working"}
154
+
155
+ @app.post("/rerun")
156
+ async def rerun_endpoint(body: RerunRequest, x_api_key: Optional[str] = Header(None)):
157
+ """
158
+ Trigger live scraping + processing.
159
+ Optional x-api-key header if API_KEY is set in env.
160
+ This endpoint blocks until processing completes and returns file paths.
161
+ """
162
+ # auth check
163
+ if API_KEY:
164
+ if not x_api_key or x_api_key != API_KEY:
165
+ logger.warning("Rejected rerun: invalid API key")
166
+ raise HTTPException(status_code=401, detail="Invalid or missing x-api-key")
167
+
168
+ # create a new working folder
169
+ # uid = uuid.uuid4().hex
170
+ work_dir = STORAGE_DIR / "latest"
171
+ work_dir.mkdir(parents=True, exist_ok=True)
172
+
173
+ # step 1: scrape live data -> create input CSV path
174
+ input_csv = work_dir / "scraped_input.csv"
175
+ limits= INTENT_LIMITS[body.intent]
176
+ logger.info(f"Received rerun request. Intent: {body.intent}, Limits: {limits}")
177
+
178
+ try:
179
+ logger.info(f"Starting scraping to {input_csv}...")
180
+ scrape_live_data(str(input_csv),int(limits["per_query"]),int(limits["total"]))
181
+ logger.info("Scraping completed successfully.")
182
+ except Exception as e:
183
+ logger.exception("Scraping failed: %s", e)
184
+ raise HTTPException(status_code=500, detail=f"Scraping failed: {e}")
185
+
186
+ # step 2: process csv into pdf, docx, analysis_output.csv
187
+ try:
188
+ logger.info("Calling user-provided processor.generate_reports_from_csv")
189
+ # assume processor writes to out_dir and returns dict or nothing
190
+ out = processor.generate_reports_from_csv(str(input_csv), str(work_dir))
191
+ logger.info(f"Processing return value: {out}")
192
+
193
+ # normalize result
194
+ pdf_path = str(work_dir / "report.pdf")
195
+ csv_path = str(work_dir / "analysis_output.csv")
196
+ docx_path = str(work_dir / "report.docx")
197
+ # if processor returned explicit paths, use them
198
+ if isinstance(out, dict):
199
+ pdf_path = out.get("pdf", pdf_path)
200
+ csv_path = out.get("csv", csv_path)
201
+ docx_path = out.get("docx", docx_path)
202
+ result = {"pdf": pdf_path, "csv": csv_path, "docx": docx_path}
203
+ except Exception as e:
204
+ logger.exception("Processing failed: %s", e)
205
+ raise HTTPException(status_code=500, detail=f"Processing failed: {e}")
206
+
207
+ # step 3: update 'latest' storage (atomically)
208
+ try:
209
+ # clear latest directory
210
+ # if LATEST_DIR.exists():
211
+ # shutil.rmtree(LATEST_DIR)
212
+ LATEST_DIR.mkdir(parents=True, exist_ok=True)
213
+ # Define IST timezone
214
+ IST = timezone(timedelta(hours=5, minutes=30))
215
+ generated_at = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S")
216
+ # write metadata file
217
+ meta = {
218
+ "pdf": "/files/report.pdf" if (LATEST_DIR / "report.pdf").exists() else "",
219
+ "csv": "/files/analysis_output.csv" if (LATEST_DIR / "analysis_output.csv").exists() else "",
220
+ "docx": "/files/report.docx" if (LATEST_DIR / "report.docx").exists() else "",
221
+ "generated_at": generated_at,
222
+ }
223
+
224
+ # write meta to disk for persistence
225
+ with open(LATEST_DIR / "meta.json", "w", encoding="utf-8") as mf:
226
+ import json
227
+ json.dump(meta, mf)
228
+
229
+ except Exception as e:
230
+ logger.exception("Failed to update latest storage: %s", e)
231
+ raise HTTPException(status_code=500, detail=f"Failed to update latest storage: {e}")
232
+
233
+ logger.info("Rerun completed, files available under latest/ directory")
234
+ return JSONResponse(status_code=200, content={
235
+ "status": "ok",
236
+ "pdf": meta["pdf"],
237
+ "csv": meta["csv"],
238
+ "docx": meta["docx"]
239
+ })
240
+
241
+
242
+ @app.get("/report")
243
+ async def get_report():
244
+ """
245
+ Return metadata about current report (pdf/csv/docx)
246
+ """
247
+ meta_file = LATEST_DIR / "meta.json"
248
+ if not meta_file.exists():
249
+ raise HTTPException(status_code=404, detail="No report available yet")
250
+ import json
251
+ with open(meta_file, "r", encoding="utf-8") as f:
252
+ meta = json.load(f)
253
+ return JSONResponse(status_code=200, content=meta)
254
+
255
+ @app.get("/pdf/view/{filename}")
256
+ async def view_pdf(filename: str):
257
+ path = LATEST_DIR / filename
258
+ if not path.exists():
259
+ raise HTTPException(404, "File not found")
260
+
261
+ return FileResponse(
262
+ path,
263
+ media_type="application/pdf",
264
+ headers={
265
+ "Content-Disposition": f'inline; filename="{path.name}"'
266
+ }
267
+ )
268
+
269
+ @app.get("/pdf/download/{filename}")
270
+ async def download_pdf(filename: str):
271
+ path = LATEST_DIR / filename
272
+ if not path.exists():
273
+ raise HTTPException(404, "File not found")
274
+
275
+ return FileResponse(
276
+ path,
277
+ media_type="application/pdf",
278
+ headers={
279
+ "Content-Disposition": f'attachment; filename="{path.name}"'
280
+ }
281
+ )
282
+
283
+
284
+ @app.get("/files/{filename}")
285
+ async def serve_file(filename: str, request: Request):
286
+ """
287
+ Serve files from the latest directory. Supports Range requests (for PDFs).
288
+ """
289
+ safe_name = os.path.basename(filename)
290
+ path = LATEST_DIR / safe_name
291
+ if not path.exists() or not path.is_file():
292
+ raise HTTPException(status_code=404, detail="File not found")
293
+ # Detect file type
294
+ if path.suffix.lower() == ".pdf":
295
+ media_type = "application/pdf"
296
+ elif path.suffix.lower() == ".csv":
297
+ media_type = "text/csv"
298
+ elif path.suffix.lower() == ".docx":
299
+ media_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
300
+ else:
301
+ media_type = "application/octet-stream"
302
+
303
+ # if the client supports Range (commonly for PDFs), use range_stream_response
304
+ range_header = request.headers.get("range")
305
+ if range_header and path.suffix.lower() == ".pdf":
306
+ return range_stream_response(path, request)
307
+ else:
308
+ # full file streaming
309
+ def file_iterator():
310
+ with open(path, "rb") as f:
311
+ while True:
312
+ chunk = f.read(1024 * 1024)
313
+ if not chunk:
314
+ break
315
+ yield chunk
316
+ headers = {
317
+ "Content-Disposition": f'inline; filename="{path.name}"',
318
+ "Content-Length": str(path.stat().st_size),
319
+ }
320
+ return StreamingResponse(file_iterator(), media_type=media_type, headers=headers)
321
+
322
+ if __name__=='__main__':
323
+ import uvicorn
324
+ port= int(os.environ.get("PORT",8000))
325
+ logger.info("Starting on port %s",port)
326
+ uvicorn.run("main:app",host="0.0.0.0",port=port,log_level="info")
processor.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Updated: supports large tables using LongTable + docx export.
3
+ Processor module.
4
+ Expose: generate_reports_from_csv(input_csv: str, out_dir: str) -> dict
5
+ Produces: out_dir/analysis_output.csv, out_dir/report.pdf, out_dir/report.docx (optional)
6
+ """
7
+
8
+ import os,re,sys,csv,logging
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ import pandas as pd
12
+ import numpy as np
13
+ import matplotlib.pyplot as plt
14
+ from wordcloud import WordCloud, STOPWORDS
15
+ from transformers import pipeline
16
+ from sklearn.feature_extraction.text import CountVectorizer
17
+ from sklearn.decomposition import LatentDirichletAllocation
18
+
19
+ # reportlab platypus
20
+ from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer, PageBreak,
21
+ TableStyle, Image, LongTable)
22
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
23
+ from reportlab.lib import colors
24
+ from reportlab.lib.pagesizes import A4
25
+ from reportlab.lib.units import inch
26
+ from reportlab.lib.enums import TA_LEFT
27
+
28
+ # try import python-docx (optional)
29
+ DOCX_AVAILABLE = True
30
+ try:
31
+ from docx import Document
32
+ from docx.shared import Inches
33
+ except Exception:
34
+ DOCX_AVAILABLE = False
35
+
36
+ logger = logging.getLogger("processor")
37
+ logger.setLevel(logging.INFO)
38
+
39
+ # ---------------- CONFIG ----------------
40
+ CSV_ENCODING = "utf-8"
41
+ MAX_ROWS = None # None => all rows
42
+ TOPIC_COUNT = 3
43
+
44
+ # Table teaser length to avoid massive single-cell height in PDF tables
45
+ TEASER_CHAR_LIMIT = 900
46
+
47
+ # ---------------- UTIL ----------------
48
+ RELATIVE_TIME_RE = re.compile(
49
+ r'(?:(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago)|\b(yesterday|today|just now|now)\b',
50
+ flags=re.IGNORECASE
51
+ )
52
+
53
+ try:
54
+ import torch
55
+ device = 0 if torch.cuda.is_available() else -1
56
+ except Exception:
57
+ device = -1
58
+
59
+ try:
60
+ sentiment_model = pipeline("sentiment-analysis",
61
+ model="distilbert-base-uncased-finetuned-sst-2-english",
62
+ device=device)
63
+ except Exception as e:
64
+ print("Failed to load requested model:", e)
65
+ try:
66
+ sentiment_model = pipeline("sentiment-analysis", device=device)
67
+ except Exception as ex:
68
+ print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
69
+
70
+
71
+ def parse_relative_time(s: str, ref: pd.Timestamp):
72
+ if not isinstance(s, str) or s.strip() == "":
73
+ return pd.NaT
74
+ s = s.strip().lower()
75
+ if s in ("just now", "now"):
76
+ return ref
77
+ if s == "today":
78
+ return pd.Timestamp(ref.date())
79
+ if s == "yesterday":
80
+ return ref - pd.Timedelta(days=1)
81
+ s = re.sub(r'\b(an|a)\b', '1', s)
82
+ m = re.search(r'(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago', s)
83
+ if not m:
84
+ return pd.NaT
85
+ qty = int(m.group(1)); unit = m.group(2).lower()
86
+ if unit in ("second","sec","s"): return ref - pd.Timedelta(seconds=qty)
87
+ if unit in ("minute","min","m"): return ref - pd.Timedelta(minutes=qty)
88
+ if unit in ("hour","hr","h"): return ref - pd.Timedelta(hours=qty)
89
+ if unit in ("day","d"): return ref - pd.Timedelta(days=qty)
90
+ if unit in ("week","w"): return ref - pd.Timedelta(weeks=qty)
91
+ if unit in ("month","mo"): return ref - pd.Timedelta(days=qty * 30)
92
+ if unit in ("year","yr","y"): return ref - pd.Timedelta(days=qty * 365)
93
+ return pd.NaT
94
+
95
+ def clean_text(text: str) -> str:
96
+ if not isinstance(text, str): return ""
97
+ text = re.sub(r"http\S+", "", text)
98
+ text = re.sub(r"@\w+", "", text)
99
+ text = re.sub(r"#\w+", "", text)
100
+ text = re.sub(r"[^A-Za-z\s]", " ", text)
101
+ text = re.sub(r"\s+", " ", text)
102
+ return text.lower().strip()
103
+
104
+ def chunked(iterable, size):
105
+ for i in range(0, len(iterable), size):
106
+ yield iterable[i:i+size]
107
+
108
+
109
+ def teaser(s, n=TEASER_CHAR_LIMIT):
110
+ if not isinstance(s, str): return ""
111
+ s = s.strip()
112
+ return (s if len(s) <= n else s[:n-1].rsplit(" ",1)[0] + " ...")
113
+
114
+ def parse_score(x):
115
+ if pd.isna(x): return np.nan
116
+ s = str(x)
117
+ m = re.search(r"(-?\d+)", s.replace(",", ""))
118
+ if m: return int(m.group(1))
119
+ nums = re.findall(r"\d+", s)
120
+ return int(nums[0]) if nums else np.nan
121
+
122
+ def parse_time_value(v,ref_ts):
123
+ if isinstance(v, (pd.Timestamp, datetime)): return pd.to_datetime(v)
124
+ if pd.isna(v): return pd.NaT
125
+ s = str(v).strip()
126
+ try:
127
+ parsed = pd.to_datetime(s, errors='coerce', utc=None)
128
+ if pd.notna(parsed): return parsed
129
+ except Exception: pass
130
+ rt = parse_relative_time(s, ref_ts)
131
+ if pd.notna(rt): return pd.to_datetime(rt)
132
+ return pd.NaT
133
+
134
+ def compile_list(lst): return [re.compile(pat, flags=re.IGNORECASE) for pat in lst]
135
+
136
+
137
+ # ---------------- India-specific nature detection ----------------
138
+ PRO_INDIA = [r"\bjai hind\b", r"\bvande mataram\b", r"\bpro india\b", r"\bpro-india\b", r"\bsupport (?:india|modi|bjp)\b", r"\bproud of india\b", r"\bindia is great\b"]
139
+ ANTI_INDIA = [r"\banti[- ]?india\b", r"\banti national\b", r"\btraitor\b", r"\banti-india\b", r"\bkill india\b", r"\bboycott india\b"]
140
+ CRITICAL_GOVT = [r"\bmodi sucks\b", r"\bcorrupt government\b", r"\bgovernment (?:is )?failing\b", r"\b(criticis|criticize|criticising) (?:government|modi|bjp)\b", r"\bpolicy (?:failure|fail)\b", r"\banti-corruption\b", r"\bmisgovern(ance|ing)\b", r"\bgovernment (?:policy|policies)"]
141
+ SUPPORT_OPPOSITION = [r"\bsupport (?:congress|aam aadmi|aap|opposition)\b", r"\bvot(e|ing) for .*opposition\b"]
142
+ SEPARATIST = [r"\bazadi\b", r"\bseparatist\b", r"\bsecede\b", r"\bindependence for\b"]
143
+ COMMUNAL = [r"\bcommunal\b", r"\breligious (?:tension|hatred)\b", r"\breligious\b", r"\bminority\b"]
144
+ CALL_TO_ACTION = [r"\bprotest\b", r"\bboycott\b", r"\bjoin (?:the )?protest\b", r"\bstrike\b", r"\brally\b", r"\baction\b"]
145
+ CONSPIRACY = [r"\bforeign funded\b", r"\bdeep state\b", r"\bconspiracy\b", r"\bwestern plot\b", r"\bcia\b", r"\bsecret agenda\b"]
146
+
147
+ PRO_INDIA_RE = compile_list(PRO_INDIA); ANTI_INDIA_RE = compile_list(ANTI_INDIA)
148
+ CRITICAL_GOVT_RE = compile_list(CRITICAL_GOVT); SUPPORT_OPPOSITION_RE = compile_list(SUPPORT_OPPOSITION)
149
+ SEPARATIST_RE = compile_list(SEPARATIST); COMMUNAL_RE = compile_list(COMMUNAL)
150
+ CALL_TO_ACTION_RE = compile_list(CALL_TO_ACTION); CONSPIRACY_RE = compile_list(CONSPIRACY)
151
+
152
+
153
+ def text_matches_any(text, patterns):
154
+ for pat in patterns:
155
+ if pat.search(text or ""): return True
156
+ return False
157
+
158
+ def determine_nature(text, sentiment_label):
159
+ t = (text or "").lower()
160
+ if text_matches_any(t, SEPARATIST_RE): return "separatist"
161
+ if text_matches_any(t, ANTI_INDIA_RE): return "anti-india"
162
+ if text_matches_any(t, PRO_INDIA_RE): return "pro-india"
163
+ if text_matches_any(t, CALL_TO_ACTION_RE): return "call-to-action"
164
+ if text_matches_any(t, COMMUNAL_RE): return "communal"
165
+ if text_matches_any(t, CONSPIRACY_RE): return "conspiratorial"
166
+ if text_matches_any(t, CRITICAL_GOVT_RE): return "critical-of-government"
167
+ if text_matches_any(t, SUPPORT_OPPOSITION_RE): return "supportive-of-opposition"
168
+ s = str(sentiment_label).upper()
169
+ if "POS" in s: return "supportive"
170
+ if "NEG" in s: return "critical"
171
+ return "neutral"
172
+
173
+ # ---------------- DANGEROUS FLAG ----------------
174
+ danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant","insurgency","boycott","protest","call to action"]
175
+ pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b', flags=re.IGNORECASE)
176
+
177
+ def is_dangerous(text, sentiment):
178
+ if pattern.search(text or ""): return True
179
+ return (str(sentiment).upper() == "NEGATIVE" and text.strip() != "")
180
+
181
+ def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
182
+ """
183
+ Runs full analysis pipeline. Returns dict: {'pdf':..., 'csv':..., 'docx':...}
184
+ """
185
+ logger.info("Running processing pipeline on %s",input_csv)
186
+ out_dir= Path(out_dir)
187
+ out_dir.mkdir(parents=True,exist_ok=True)
188
+
189
+ # ---------------- READ CSV ----------------
190
+ if not os.path.exists(input_csv):
191
+ print("CSV file not found:", input_csv); sys.exit(1)
192
+
193
+ print("Loading CSV:", input_csv)
194
+ try:
195
+ df_raw = pd.read_csv(input_csv, encoding=CSV_ENCODING, low_memory=False)
196
+ except Exception as e:
197
+ print("Error reading CSV:", e); sys.exit(1)
198
+
199
+ if MAX_ROWS:
200
+ df_raw = df_raw.head(MAX_ROWS)
201
+
202
+ title_col = "Title"
203
+ reference_col = "Reference"
204
+ subreddit_col = "Subreddit"
205
+ score_col = "Score"
206
+ comment_col = "Comments"
207
+ time_col = "Time"
208
+ author_col = "Author"
209
+ desc_col = "Description"
210
+ url_col = "Url"
211
+
212
+ if not any(c in df_raw.columns for c in [title_col, comment_col, desc_col]):
213
+ print("No text column detected. CSV columns:", list(df_raw.columns)); sys.exit(1)
214
+
215
+ # if title is None(not provided) entire column is filled with "" strings
216
+ # if title is provided but for some it is NaN after astype(str) they become "nan" not empty string
217
+ # normalized df
218
+ df = pd.DataFrame()
219
+ df["orig_index"] = df_raw.index.astype(str)
220
+ df["title"] = df_raw[title_col].fillna("").astype(str) if title_col else ""
221
+ df["reference"] = df_raw[reference_col].astype(str) if reference_col else ""
222
+ df["subreddit"] = df_raw[subreddit_col] if subreddit_col else "N/A"
223
+ df["raw_score"] = df_raw[score_col] if score_col else np.nan
224
+ df["comment"] = df_raw[comment_col].fillna("").astype(str) if comment_col else ""
225
+ df["time_raw"] = df_raw[time_col] if time_col else ""
226
+ df["username"] = df_raw[author_col] if author_col else "N/A"
227
+ df["description"] = df_raw[desc_col].fillna("").astype(str) if desc_col else ""
228
+ df["url"] = df_raw[url_col] if url_col else ""
229
+
230
+ df["text_for_analysis"] = (df["title"] + " " + df["comment"] + " " + df["description"]).str.strip()
231
+ df.loc[df["text_for_analysis"].str.strip() == "", "text_for_analysis"] = df.loc[df["text_for_analysis"].str.strip() == "", :].apply(
232
+ lambda r: " ".join([str(v) for v in r.values if isinstance(v, str) and v.strip() != ""]), axis=1
233
+ )
234
+ df["clean_text"] = df["text_for_analysis"].apply(clean_text)
235
+ df["score"] = df["raw_score"].apply(parse_score)
236
+
237
+ # parse times
238
+ try:
239
+ ref_ts = pd.to_datetime(os.path.getmtime(input_csv), unit='s')
240
+ except Exception:
241
+ ref_ts = pd.Timestamp.now()
242
+
243
+ df["created_at"] = df["time_raw"].apply(lambda x: parse_time_value(x,ref_ts))
244
+
245
+ # ---------------- SENTIMENT ----------------
246
+ print("Loading sentiment model...")
247
+
248
+ texts = df["clean_text"].tolist()
249
+ preds = []
250
+ batch_size = 32
251
+ for batch in chunked(texts, batch_size):
252
+ out = sentiment_model(batch, truncation=True)
253
+ for o in out:
254
+ label = o.get("label", "NEUTRAL")
255
+ score = float(o.get("score", 0.0))
256
+ preds.append((label, score))
257
+
258
+ df["sentiment"] = [p[0] for p in preds]
259
+ df["sentiment_score"] = [p[1] for p in preds]
260
+ # df["nature"] = df.apply(lambda r: determine_nature(r["clean_text"], r["sentiment"]), axis=1)
261
+ df["nature"] = [
262
+ determine_nature(text, sentiment)
263
+ for text, sentiment in zip(df["clean_text"], df["sentiment"])
264
+ ]
265
+
266
+
267
+ # ---------------- TOPIC MODELING ----------------
268
+ print("Performing topic modeling...")
269
+
270
+ vectorizer = CountVectorizer(stop_words="english", min_df=2)
271
+ try:
272
+ X = vectorizer.fit_transform(df["clean_text"])
273
+ except Exception as e:
274
+ print("Topic vectorization failed:", e); X = None
275
+
276
+ if X is None or X.shape[0] < 3 or len(vectorizer.get_feature_names_out()) < 5:
277
+ df["topic"] = np.nan
278
+ topic_counts = pd.Series(dtype=int)
279
+ else:
280
+ n_topics = min(TOPIC_COUNT, X.shape[0])
281
+ lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
282
+ lda.fit(X)
283
+ doc_topic = lda.transform(X)
284
+ df["topic"] = doc_topic.argmax(axis=1)
285
+ topic_counts = df["topic"].value_counts().sort_index()
286
+
287
+ df["dangerous"] = df.apply(lambda r: is_dangerous(r["clean_text"], r["sentiment"]), axis=1)
288
+ dangerous_tweets = df[df["dangerous"]].copy()
289
+ print(f"Flagged {len(dangerous_tweets)} potentially dangerous posts.")
290
+
291
+ # ---------------- VISUALS ----------------
292
+ try:
293
+ # sentiment plot
294
+ sent_counts = df["sentiment"].value_counts()
295
+ plt.figure(figsize=(6,4))
296
+ sent_counts.plot(kind="bar")
297
+ plt.title("Sentiment Distribution")
298
+ plt.tight_layout()
299
+ plt.savefig(out_dir / "sentiment.png", dpi=150)
300
+ plt.close()
301
+ # topic plot
302
+ if "topic" in df and df["topic"].notna().any():
303
+ topic_counts = df["topic"].value_counts().sort_index()
304
+ plt.figure(figsize=(6,4))
305
+ topic_counts.plot(kind="bar")
306
+ plt.title("Topic Distribution")
307
+ plt.tight_layout()
308
+ plt.savefig(out_dir / "topics.png", dpi=150)
309
+ plt.close()
310
+ # danger wordcloud
311
+ dangerous_df = df[df["dangerous"]]
312
+ if not dangerous_df.empty:
313
+ wc_text = " ".join(dangerous_df["clean_text"].tolist())
314
+ wc = WordCloud(width=1000, height=400, background_color="white", stopwords=set(STOPWORDS)).generate(wc_text)
315
+ plt.figure(figsize=(12,5))
316
+ plt.imshow(wc, interpolation="bilinear")
317
+ plt.axis("off")
318
+ plt.tight_layout()
319
+ plt.savefig(out_dir / "danger_wc.png", dpi=150)
320
+ plt.close()
321
+ except Exception as e:
322
+ logger.warning("Visuals generation failed: %s", e)
323
+
324
+
325
+ # ---------------- BUILD PDF ----------------
326
+ print("Building PDF report (LongTable for large tables)...")
327
+ pdf_out= out_dir/"report.pdf"
328
+ styles = getSampleStyleSheet()
329
+ styleN = styles["Normal"]
330
+ styleH = styles["Heading2"]
331
+ title_style = styles["Title"]
332
+ tweet_paragraph_style = ParagraphStyle("TweetStyle", parent=styles["BodyText"], fontSize=9, leading=11, spaceAfter=6, alignment=TA_LEFT)
333
+
334
+ doc = SimpleDocTemplate(pdf_out, pagesize=A4, rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36)
335
+ elements = []
336
+ elements.append(Paragraph("Reddit Posts Report (CSV Source) — India-specific Nature", title_style))
337
+ elements.append(Spacer(1, 8))
338
+ elements.append(Paragraph(f"Total Posts Processed: {len(df)}", styleN))
339
+ elements.append(Spacer(1, 8))
340
+
341
+ # Sentiment summary
342
+ elements.append(Paragraph("Sentiment Analysis Summary", styleH))
343
+ total = len(df)
344
+ for label, count in sent_counts.items():
345
+ pct = count / total * 100 if total > 0 else 0
346
+ elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
347
+ elements.append(Spacer(1, 6))
348
+ if os.path.exists("sentiment.png"):
349
+ elements.append(Image("sentiment.png", width=5.5*inch, height=3*inch))
350
+ elements.append(Spacer(1, 12))
351
+
352
+ # Topic & Nature summary
353
+ if not topic_counts.empty:
354
+ elements.append(Paragraph("Topic Modeling Summary", styleH))
355
+ for idx, val in topic_counts.items():
356
+ elements.append(Paragraph(f"Topic {int(idx)}: {int(val)} posts", styleN))
357
+ elements.append(Spacer(1, 6))
358
+ if os.path.exists("topics.png"): elements.append(Image("topics.png", width=5.5*inch, height=3*inch))
359
+ elements.append(Spacer(1, 12))
360
+
361
+ elements.append(Paragraph("Nature (India-specific) Summary", styleH))
362
+ nature_counts = df["nature"].value_counts()
363
+ for label, count in nature_counts.items():
364
+ pct = count / total * 100 if total > 0 else 0
365
+ elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
366
+ elements.append(Spacer(1, 12))
367
+
368
+ # Dangerous posts table (LongTable)
369
+ elements.append(Paragraph("Flagged Potentially Dangerous Posts", styleH))
370
+ elements.append(Spacer(1, 6))
371
+ if dangerous_tweets.empty:
372
+ elements.append(Paragraph("No dangerous posts detected.", styleN))
373
+ else:
374
+ # prepare LongTable data (header + rows)
375
+ header = ["Post (teaser)", "Subreddit", "Author", "Sentiment", "Nature", "Topic", "Date"]
376
+ lt_data = [header]
377
+ for _, row in dangerous_tweets.iterrows():
378
+ date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
379
+ lt_data.append([
380
+ Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style),
381
+ row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
382
+ row["username"] if pd.notna(row["username"]) else "N/A",
383
+ row["sentiment"],
384
+ row["nature"],
385
+ str(int(row["topic"])) if not pd.isna(row["topic"]) else "N/A",
386
+ date_str
387
+ ])
388
+ col_widths = [3.0*inch, 0.7*inch, 0.8*inch, 0.6*inch, 0.8*inch, 0.5*inch, 1.0*inch]
389
+ lt = LongTable(lt_data, colWidths=col_widths, repeatRows=1)
390
+ # style: small font, grid, header background
391
+ lt_style = TableStyle([
392
+ ('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
393
+ ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
394
+ ('ALIGN', (1,0), (-1,-1), 'CENTER'),
395
+ ('VALIGN', (0,0), (-1,-1), 'TOP'),
396
+ ('GRID', (0,0), (-1,-1), 0.25, colors.grey),
397
+ ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
398
+ ('FONTSIZE', (0,0), (-1,-1), 8),
399
+ ('LEFTPADDING', (0,0), (-1,-1), 4),
400
+ ('RIGHTPADDING', (0,0), (-1,-1), 4),
401
+ ])
402
+ lt.setStyle(lt_style)
403
+ elements.append(lt)
404
+ elements.append(Spacer(1, 12))
405
+ if os.path.exists("danger_wc.png"):
406
+ elements.append(Paragraph("Word Cloud of Flagged Posts", styleH)); elements.append(Image("danger_wc.png", width=5.5*inch, height=2.6*inch))
407
+
408
+ elements.append(PageBreak())
409
+
410
+ # All collected posts (LongTable) - include full dataset but use teaser to avoid huge cells
411
+ elements.append(Paragraph("All Collected Posts", styles['Heading2']))
412
+ all_header = ["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]
413
+ all_lt_data = [all_header]
414
+ for idx, row in df.iterrows():
415
+ date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
416
+ all_lt_data.append([
417
+ date_str,
418
+ row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
419
+ row["username"] if pd.notna(row["username"]) else "N/A",
420
+ str(row["score"]) if not pd.isna(row["score"]) else "N/A",
421
+ row["nature"],
422
+ Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style)
423
+ ])
424
+
425
+ all_col_widths = [1.0*inch, 1.0*inch, 1.0*inch, 0.7*inch, 0.9*inch, 2.8*inch]
426
+ all_lt = LongTable(all_lt_data, colWidths=all_col_widths, repeatRows=1)
427
+ all_lt.setStyle(TableStyle([
428
+ ('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
429
+ ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
430
+ ('GRID', (0,0), (-1,-1), 0.25, colors.grey),
431
+ ('VALIGN', (0,0), (-1,-1), 'TOP'),
432
+ ('FONTSIZE', (0,0), (-1,-1), 8),
433
+ ('LEFTPADDING', (0,0), (-1,-1), 4),
434
+ ('RIGHTPADDING', (0,0), (-1,-1), 4),
435
+ ]))
436
+ elements.append(all_lt)
437
+
438
+ # finish PDF
439
+ doc = SimpleDocTemplate(str(pdf_out))
440
+ doc.build(elements)
441
+ print("✅ PDF saved as:", pdf_out)
442
+
443
+ # ---------------- SAVE CSV (full enriched) ----------------
444
+ csv_out = out_dir/"analysis_output.csv"
445
+ df_out = df.copy()
446
+ df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
447
+ df_out.to_csv(csv_out, index=False, encoding="utf-8")
448
+ print("✅ Enriched CSV saved as:", csv_out)
449
+
450
+
451
+ # ---------------- DOCX EXPORT (optional) ----------------
452
+ if not DOCX_AVAILABLE:
453
+ print("python-docx not installed — skipping DOCX export. Install via: pip install python-docx")
454
+ else:
455
+ try:
456
+ print("Building DOCX report...")
457
+ DOCX_OUTPUT= out_dir/"report.docx"
458
+ docx = Document()
459
+ docx.add_heading("Reddit Posts Report (India-specific Nature)", level=1)
460
+ docx.add_paragraph(f"Total Posts Processed: {len(df)}")
461
+ docx.add_heading("Sentiment Analysis Summary", level=2)
462
+ for label, count in sent_counts.items():
463
+ pct = count / total * 100 if total > 0 else 0
464
+ docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
465
+
466
+ docx.add_heading("Nature Summary", level=2)
467
+ for label, count in nature_counts.items():
468
+ pct = count / total * 100 if total > 0 else 0
469
+ docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
470
+
471
+ # add small sample table (first 200 rows or less)
472
+ sample_n = min(200, len(df))
473
+ docx.add_heading(f"Sample of First {sample_n} Posts", level=2)
474
+ table = docx.add_table(rows=1, cols=6)
475
+ hdr_cells = table.rows[0].cells
476
+ hdr_cells[0].text = "Date"
477
+ hdr_cells[1].text = "Subreddit"
478
+ hdr_cells[2].text = "Author"
479
+ hdr_cells[3].text = "Score"
480
+ hdr_cells[4].text = "Nature"
481
+ hdr_cells[5].text = "Post (teaser)"
482
+ for idx, row in df.head(sample_n).iterrows():
483
+ row_cells = table.add_row().cells
484
+ date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
485
+ row_cells[0].text = date_str
486
+ row_cells[1].text = str(row["subreddit"]) if pd.notna(row["subreddit"]) else "N/A"
487
+ row_cells[2].text = str(row["username"]) if pd.notna(row["username"]) else "N/A"
488
+ row_cells[3].text = str(row["score"]) if not pd.isna(row["score"]) else "N/A"
489
+ row_cells[4].text = str(row["nature"])
490
+ row_cells[5].text = teaser(row["text_for_analysis"], 300)
491
+
492
+ docx.save(DOCX_OUTPUT)
493
+ print("✅ DOCX saved as:", DOCX_OUTPUT)
494
+ except Exception as e:
495
+ logger.exception("DOCX creation failed: %s", e)
496
+ if DOCX_OUTPUT.exists():
497
+ try:
498
+ DOCX_OUTPUT.unlink(missing_ok=True)
499
+ except Exception:
500
+ pass
501
+ logger.info("Processor: finished, files at %s", out_dir)
502
+ return {"pdf": str(pdf_out), "csv": str(csv_out), "docx": str(DOCX_OUTPUT) if DOCX_OUTPUT.exists() else ""}
503
+
reddit_scrapper.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import time
4
+ import logging
5
+ from pathlib import Path
6
+ from datetime import datetime, timezone
7
+ from typing import Iterable, List, Optional
8
+ from dotenv import load_dotenv
9
+
10
+ import praw
11
+ import prawcore
12
+ import pytz
13
+
14
+ logger = logging.getLogger("reddit_scraper")
15
+ logger.setLevel(logging.INFO)
16
+
17
+ load_dotenv()
18
+
19
+ # default queries (copied from your Selenium version)
20
+ political_queries: List[str] = [
21
+ "india politics",
22
+ "india protest",
23
+ "india government fail",
24
+ "india corruption",
25
+ "india democracy threat",
26
+ "india dictatorship",
27
+ "india religious violence",
28
+ "india communal riots",
29
+ "india anti muslim",
30
+ "india anti sikh",
31
+ "india caste violence",
32
+ "india hate speech",
33
+ "india freedom struggle",
34
+ "india human rights violation",
35
+ "india farmers protest",
36
+ "india caa protest",
37
+ "india nrc protest",
38
+ "india modi resign",
39
+ "india bjp fail",
40
+ "india rss agenda",
41
+ "india fake news",
42
+ "india propaganda",
43
+ "india media blackout",
44
+ "boycott india",
45
+ "boycott indian products",
46
+ "boycott bollywood",
47
+ "kashmir freedom",
48
+ "kashmir human rights",
49
+ "kashmir india occupation",
50
+ "kashmir protest",
51
+ "khalistan movement",
52
+ "punjab separatism",
53
+ "anti national india",
54
+ "down with india",
55
+ "stop india aggression",
56
+ "india pakistan conflict",
57
+ "china india border",
58
+ "india brutality",
59
+ "india minority oppression"
60
+ ]
61
+
62
+ def _init_reddit():
63
+ """Initialize a PRAW Reddit instance using environment variables."""
64
+ client_id = os.environ.get("REDDIT_CLIENT_ID")
65
+ client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
66
+ user_agent = os.environ.get("REDDIT_USER_AGENT", "reddit_scraper:v1.0")
67
+
68
+ logger.info(f"Initializing Reddit with ClientID: {client_id}, Agent: {user_agent}")
69
+
70
+ if not client_id or not client_secret:
71
+ logger.error("Missing REDDIT_CLIENT_ID or REDDIT_CLIENT_SECRET env vars")
72
+ raise EnvironmentError(
73
+ "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set as environment variables."
74
+ )
75
+
76
+ return praw.Reddit(
77
+ client_id=client_id,
78
+ client_secret=client_secret,
79
+ user_agent=user_agent,
80
+ check_for_async=False # prevents accidental async loop issues
81
+ )
82
+
83
+ def _format_time(created_utc: Optional[float]) -> str:
84
+ """Return timestamp string in UTC 'YYYY-MM-DD HH:MM:SS' (fallback 'N/A')."""
85
+ if not created_utc:
86
+ return "N/A"
87
+ # use UTC time for consistency
88
+ dt = datetime.fromtimestamp(created_utc, tz=timezone.utc)
89
+ return dt.strftime("%Y-%m-%d %H:%M:%S")
90
+
91
+ def scrape_reddit_to_csv(
92
+ output_csv_path: str,
93
+ per_query_limit: int,
94
+ total_limit: int,
95
+ delay_between_queries: float = 1.5
96
+ ) -> int:
97
+ """
98
+ Scrape reddit using PRAW and save results to output_csv_path.
99
+ - per_query_limit: max results to request per query (PRAW will respect rate limits)
100
+ - total_limit: overall cap on number of rows written
101
+ - returns: number of rows written
102
+ """
103
+
104
+ try:
105
+ reddit = _init_reddit()
106
+ logger.info(f"Reddit instance created. Read-only: {reddit.read_only}")
107
+ except Exception as e:
108
+ logger.exception(f"Failed to init reddit: {e}")
109
+ raise
110
+
111
+ Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
112
+ logger.info("Running PRAW scraper and saving CSV to %s", output_csv_path)
113
+
114
+ written = 0
115
+ seen_ids = set()
116
+
117
+ header = ["Title", "Reference", "Score", "Comments", "Time", "Author", "Subreddit", "Description", "Url"]
118
+
119
+ with open(output_csv_path, "w", newline="", encoding="utf-8") as fh:
120
+ writer = csv.writer(fh)
121
+ writer.writerow(header)
122
+
123
+ try:
124
+ for query in political_queries:
125
+ if written >= total_limit:
126
+ logger.info("Reached total_limit=%s, stopping.", total_limit)
127
+ break
128
+
129
+ logger.info("Searching Reddit for query: %s (limit=%s)", query, per_query_limit)
130
+ try:
131
+ # search on r/all
132
+ submissions = reddit.subreddit("all").search(query, sort="new", limit=per_query_limit)
133
+ # Force a generator fetch to check for immediate auth errors
134
+ # submissions = list(submissions)
135
+ except prawcore.exceptions.RequestException as e:
136
+ logger.warning("Network error during PRAW search for '%s': %s", query, e)
137
+ time.sleep(2)
138
+ continue
139
+ except Exception as e:
140
+ logger.exception("PRAW search failed for '%s': %s", query, e)
141
+ time.sleep(2)
142
+ continue
143
+
144
+ keywords = [kw.lower() for kw in query.split() if kw.strip()]
145
+
146
+ for sub in submissions:
147
+ if written >= total_limit:
148
+ break
149
+
150
+ try:
151
+ sid = getattr(sub, "id", None)
152
+ if not sid:
153
+ continue
154
+ if sid in seen_ids:
155
+ continue
156
+ seen_ids.add(sid)
157
+
158
+ title = getattr(sub, "title", "") or ""
159
+ reference = sid
160
+ score = getattr(sub, "score", 0) or 0
161
+ comments = getattr(sub, "num_comments", 0) or 0
162
+ created = _format_time(getattr(sub, "created_utc", None))
163
+ author = getattr(sub.author, "name", "deleted") if getattr(sub, "author", None) else "deleted"
164
+ subreddit = getattr(sub.subreddit, "display_name", "") or ""
165
+ description = getattr(sub, "selftext", "") or ""
166
+ url = getattr(sub, "url", "") or ""
167
+
168
+ # replicate the original filtering: ensure query keywords appear in title or description
169
+ text_for_check = f"{title} {description}".lower()
170
+ if keywords and not any(kw in text_for_check for kw in keywords):
171
+ # skip items that don't appear relevant
172
+ continue
173
+
174
+ writer.writerow([title, reference, score, comments, created, author, subreddit, description, url])
175
+ written += 1
176
+
177
+ except Exception as e:
178
+ # don't stop the whole scraper for one failing submission
179
+ logger.exception("Failed to process submission %s: %s", getattr(sub, "id", "<no-id>"), e)
180
+ continue
181
+
182
+ # respectful delay between queries to reduce risk of rate limiting
183
+ time.sleep(delay_between_queries)
184
+
185
+ except KeyboardInterrupt:
186
+ logger.warning("Scraper interrupted by user.")
187
+ except Exception as e:
188
+ logger.exception("Unhandled exception during scraping: %s", e)
189
+
190
+ logger.info("Scraper finished: wrote %d rows to %s", written, output_csv_path)
191
+ return written
192
+
requirements.txt ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+
4
+ pandas
5
+ numpy
6
+ scikit-learn
7
+
8
+ matplotlib
9
+ wordcloud
10
+ reportlab
11
+ python-docx
12
+
13
+ praw
14
+ requests
15
+ python-dotenv
16
+
17
+ transformers
18
+ torch
19
+ tokenizers
20
+
21
+ tqdm
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+ # absl-py==2.3.1
39
+ # annotated-types==0.7.0
40
+ # anyio==4.10.0
41
+ # astunparse==1.6.3
42
+ # attrs==25.3.0
43
+ # certifi==2025.8.3
44
+ # cffi==1.17.1
45
+ # charset-normalizer==3.4.3
46
+ # click==8.2.1
47
+ # colorama==0.4.6
48
+ # contourpy==1.3.3
49
+ # cycler==0.12.1
50
+ # fastapi==0.116.1
51
+ # filelock==3.19.1
52
+ # flatbuffers==25.2.10
53
+ # fonttools==4.59.2
54
+ # fsspec==2025.7.0
55
+ # gast==0.6.0
56
+ # google-pasta==0.2.0
57
+ # grpcio==1.74.0
58
+ # h11==0.16.0
59
+ # h5py==3.14.0
60
+ # huggingface-hub==0.34.4
61
+ # idna==3.10
62
+ # Jinja2==3.1.4
63
+ # joblib==1.5.2
64
+ # kiwisolver==1.4.9
65
+ # libclang==18.1.1
66
+ # lxml==6.0.1
67
+ # Markdown==3.8.2
68
+ # markdown-it-py==4.0.0
69
+ # matplotlib==3.10.8
70
+ # mdurl==0.1.2
71
+ # ml_dtypes==0.5.3
72
+ # mpmath==1.3.0
73
+ # namex==0.1.0
74
+ # networkx==3.3
75
+ # numpy==2.3.2
76
+ # opt_einsum==3.4.0
77
+ # optree==0.17.0
78
+ # outcome==1.3.0.post0
79
+ # packaging==25.0
80
+ # pandas==2.3.2
81
+ # pillow==12.1.0
82
+ # praw==7.8.1
83
+ # prawcore==2.4.0
84
+ # protobuf==6.32.0
85
+ # pycparser==2.22
86
+ # pydantic==2.11.7
87
+ # pydantic_core==2.33.2
88
+ # Pygments==2.19.2
89
+ # pyparsing==3.2.3
90
+ # PySocks==1.7.1
91
+ # python-dateutil==2.9.0.post0
92
+ # python-docx==1.2.0
93
+ # python-dotenv==1.2.1
94
+ # pytz==2025.2
95
+ # PyYAML==6.0.2
96
+ # regex==2025.8.29
97
+ # reportlab==4.4.3
98
+ # requests==2.32.5
99
+ # rich==14.1.0
100
+ # safetensors==0.6.2
101
+ # scikit-learn==1.7.1
102
+ # scipy==1.16.1
103
+ # selenium==4.35.0
104
+ # setuptools==80.9.0
105
+ # six==1.17.0
106
+ # sniffio==1.3.1
107
+ # sortedcontainers==2.4.0
108
+ # starlette==0.47.3
109
+ # sympy==1.13.3
110
+ # tensorboard==2.20.0
111
+ # tensorboard-data-server==0.7.2
112
+ # termcolor==3.1.0
113
+ # threadpoolctl==3.6.0
114
+ # tokenizers==0.22.0
115
+ # torch==2.8.0+cpu
116
+ # torchaudio==2.8.0+cpu
117
+ # torchvision==0.23.0+cpu
118
+ # tqdm==4.67.1
119
+ # transformers==4.56.0
120
+ # trio==0.30.0
121
+ # trio-websocket==0.12.2
122
+ # typing-inspection==0.4.1
123
+ # typing_extensions==4.15.0
124
+ # tzdata==2025.2
125
+ # update-checker==0.18.0
126
+ # urllib3==2.5.0
127
+ # uvicorn==0.35.0
128
+ # websocket-client==1.8.0
129
+ # Werkzeug==3.1.3
130
+ # wheel==0.45.1
131
+ # wordcloud==1.9.4
132
+ # wrapt==1.17.3
133
+ # wsproto==1.2.0