Spaces:

WizardCoder2007
/

social_media_analyzer

Sleeping

App Files Files Community

WizardCoder2007 committed on Jan 13

Commit

1e36e31

1 Parent(s): 8892a43

commit

Browse files

Files changed (6) hide show

.gitignore +18 -0
Dockerfile +25 -0
main.py +326 -0
processor.py +503 -0
reddit_scrapper.py +192 -0
requirements.txt +133 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,18 @@

+venv/
+.venv/
+__pycache__/
+*.pyc
+.env
+.env.local
+.env.example
+.env.debug
+.env.production
+.env.test
+.env.development
+storage/latest/
+*.png
+*.pdf
+*.csv
+*.json
+*.docx

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+FROM python:3.10-slim
+# Skip writing .pyc files and keep stdout/stderr unbuffered
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+WORKDIR /app
+# System deps (for matplotlib, reportlab, wordcloud)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    gcc \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    && rm -rf /var/lib/apt/lists/*
+# requirements.txt is copied and installed before the rest of the source
+COPY requirements.txt .
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Same port that uvicorn is told to bind to in CMD below
+EXPOSE 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py ADDED Viewed

	@@ -0,0 +1,326 @@

+import requests,time,csv,re,json,sys,math,random,io
+import uuid,shutil,logging,os
+from pathlib import Path
+from typing import Optional,Tuple
+from datetime import datetime, timezone,timedelta
+from fastapi import FastAPI, Query, HTTPException, Header, BackgroundTasks, Request, Response
+from fastapi.responses import HTMLResponse, JSONResponse,StreamingResponse,FileResponse
+from fastapi.middleware.cors import CORSMiddleware
+from starlette.concurrency import run_in_threadpool
+from pydantic import BaseModel
+from typing import Literal
+try:
+    import processor
+except Exception as e:
+    raise RuntimeError(f"Failed to import processor.py: {e}")
+from reddit_scrapper import scrape_reddit_to_csv
+# try import python-docx (optional)
+DOCX_AVAILABLE = True
+try:
+    from docx import Document
+    from docx.shared import Inches
+except Exception:
+    DOCX_AVAILABLE = False
+class RerunRequest(BaseModel):
+    """Request body for POST /rerun."""
+    # One of the keys of INTENT_LIMITS; selects how much data is scraped.
+    intent: Literal["light", "medium", "deep"]
+# Scrape budget per intent; rerun_endpoint forwards "per_query" and "total"
+# to scrape_live_data.
+INTENT_LIMITS = {
+    "light":  {"per_query": 20,  "total": 40},
+    "medium": {"per_query": 50,  "total": 300},
+    "deep":   {"per_query": 100, "total": 800},
+}
+# ---- Configuration ----
+# All generated artifacts live in <this file's directory>/storage/latest.
+BASE_DIR= Path(__file__).resolve().parent
+STORAGE_DIR= BASE_DIR/"storage"
+LATEST_DIR= STORAGE_DIR/"latest"
+STORAGE_DIR.mkdir(exist_ok=True)
+LATEST_DIR.mkdir(exist_ok=True)
+# Optional API key: when set in the environment, POST /rerun requires a
+# matching x-api-key header; when unset, /rerun is open.
+API_KEY= os.environ.get("API_KEY",None)
+# logging
+logging.basicConfig(level=logging.INFO)
+logger= logging.getLogger("report-saver")
+# FastAPI application
+app= FastAPI(title="Auto Report API (CSV → PDF/DOCX)")
+# CORS: only the origins listed here are allowed (one hosted origin plus
+# localhost development ports); methods and headers are unrestricted.
+origins=[
+    "https://ciis-indol.vercel.app",
+    "http://localhost:8080",
+    "http://127.0.0.1:8080",
+    "http://localhost:5173",
+    "http://127.0.0.1:5173",
+    "http://localhost:8000",
+    "http://127.0.0.1:8000"
+]
+app.add_middleware(CORSMiddleware, allow_origins=origins,allow_credentials=True, allow_methods=["*"],allow_headers=["*"])
+# Helper: safe path join inside storage using Path
+def storage_path(filename:str)-> Path:
+    return LATEST_DIR/filename
+def scrape_live_data(output_csv_path:str, per_query: int, total:int)->None:
+    """Thin wrapper: forward all three arguments positionally to scrape_reddit_to_csv."""
+    scrape_reddit_to_csv(output_csv_path,per_query,total)
+# ------------------------------
+# Range-supporting file response for large files (PDF preview)
+def get_range_byte_positions(range_header: str, file_size: int) -> Optional[Tuple[int, int]]:
+    # Example Range header: 'bytes=0-1023' or 'bytes=1024-'
+    if not range_header:
+        return None
+    header = range_header.strip()
+    if not header.startswith("bytes="):
+        return None
+    range_val = header.split("=", 1)[1]
+    parts = range_val.split("-")
+    try:
+        if parts[0] == "":
+            # suffix bytes: '-N' -> last N bytes
+            end = file_size - 1
+            start = file_size - int(parts[1])
+        elif parts[1] == "":
+            # 'start-' to end of file
+            start = int(parts[0])
+            end = file_size - 1
+        else:
+            start = int(parts[0])
+            end = int(parts[1])
+        if start < 0:
+            start = 0
+        if end >= file_size:
+            end = file_size - 1
+        if start > end:
+            return None
+        return (start, end)
+    except Exception:
+        return None
+def range_stream_response(path: Path, request: Request) -> StreamingResponse:
+    """Return a StreamingResponse that honors Range requests for a file."""
+    file_size = path.stat().st_size
+    range_header = request.headers.get("range")
+    range_pos = get_range_byte_positions(range_header, file_size)
+    headers = {
+        "Accept-Ranges": "bytes",
+        "Content-Type": "application/octet-stream",
+        "Content-Disposition": f'inline; filename="{path.name}"',
+    }
+    if range_pos is None:
+        # full content
+        def iterfile():
+            with open(path, "rb") as f:
+                while True:
+                    chunk = f.read(1024 * 1024)
+                    if not chunk:
+                        break
+                    yield chunk
+        headers["Content-Length"] = str(file_size)
+        return StreamingResponse(iterfile(), status_code=200, headers=headers)
+    else:
+        start, end = range_pos
+        length = end - start + 1
+        headers["Content-Length"] = str(length)
+        headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
+        # status 206 Partial Content
+        def iterfile_range():
+            with open(path, "rb") as f:
+                f.seek(start)
+                remaining = length
+                chunk_size = 1024 * 1024
+                while remaining > 0:
+                    to_read = min(chunk_size, remaining)
+                    chunk = f.read(to_read)
+                    if not chunk:
+                        break
+                    remaining -= len(chunk)
+                    yield chunk
+        return StreamingResponse(iterfile_range(), status_code=206, headers=headers)
+@app.get("/")
+def home():
+    """Liveness check: returns a fixed JSON message."""
+    return {"message":"sever working"}
+@app.post("/rerun")
+async def rerun_endpoint(body: RerunRequest, x_api_key: Optional[str] = Header(None)):
+    """
+    Trigger live scraping + processing.
+    Optional x-api-key header if API_KEY is set in env.
+    This endpoint blocks until processing completes and returns file paths.
+    """
+    # auth check
+    if API_KEY:
+        if not x_api_key or x_api_key != API_KEY:
+            logger.warning("Rejected rerun: invalid API key")
+            raise HTTPException(status_code=401, detail="Invalid or missing x-api-key")
+    # create a new working folder
+    # uid = uuid.uuid4().hex
+    work_dir = STORAGE_DIR / "latest"
+    work_dir.mkdir(parents=True, exist_ok=True)
+    # step 1: scrape live data -> create input CSV path
+    input_csv = work_dir / "scraped_input.csv"
+    limits= INTENT_LIMITS[body.intent]
+    logger.info(f"Received rerun request. Intent: {body.intent}, Limits: {limits}")
+    try:
+        logger.info(f"Starting scraping to {input_csv}...")
+        scrape_live_data(str(input_csv),int(limits["per_query"]),int(limits["total"]))
+        logger.info("Scraping completed successfully.")
+    except Exception as e:
+        logger.exception("Scraping failed: %s", e)
+        raise HTTPException(status_code=500, detail=f"Scraping failed: {e}")
+    # step 2: process csv into pdf, docx, analysis_output.csv
+    try:
+        logger.info("Calling user-provided processor.generate_reports_from_csv")
+        # assume processor writes to out_dir and returns dict or nothing
+        out = processor.generate_reports_from_csv(str(input_csv), str(work_dir))
+        logger.info(f"Processing return value: {out}")
+        # normalize result
+        pdf_path = str(work_dir / "report.pdf")
+        csv_path = str(work_dir / "analysis_output.csv")
+        docx_path = str(work_dir / "report.docx")
+        # if processor returned explicit paths, use them
+        if isinstance(out, dict):
+            pdf_path = out.get("pdf", pdf_path)
+            csv_path = out.get("csv", csv_path)
+            docx_path = out.get("docx", docx_path)
+        result = {"pdf": pdf_path, "csv": csv_path, "docx": docx_path}
+    except Exception as e:
+        logger.exception("Processing failed: %s", e)
+        raise HTTPException(status_code=500, detail=f"Processing failed: {e}")
+    # step 3: update 'latest' storage (atomically)
+    try:
+        # clear latest directory
+        # if LATEST_DIR.exists():
+        #     shutil.rmtree(LATEST_DIR)
+        LATEST_DIR.mkdir(parents=True, exist_ok=True)
+        # Define IST timezone
+        IST = timezone(timedelta(hours=5, minutes=30))
+        generated_at = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S")
+        # write metadata file
+        meta = {
+            "pdf": "/files/report.pdf" if (LATEST_DIR / "report.pdf").exists() else "",
+            "csv": "/files/analysis_output.csv" if (LATEST_DIR / "analysis_output.csv").exists() else "",
+            "docx": "/files/report.docx" if (LATEST_DIR / "report.docx").exists() else "",
+            "generated_at": generated_at,
+        }
+        # write meta to disk for persistence
+        with open(LATEST_DIR / "meta.json", "w", encoding="utf-8") as mf:
+            import json
+            json.dump(meta, mf)
+    except Exception as e:
+        logger.exception("Failed to update latest storage: %s", e)
+        raise HTTPException(status_code=500, detail=f"Failed to update latest storage: {e}")
+    logger.info("Rerun completed, files available under latest/ directory")
+    return JSONResponse(status_code=200, content={
+        "status": "ok",
+        "pdf": meta["pdf"],
+        "csv": meta["csv"],
+        "docx": meta["docx"]
+    })
+@app.get("/report")
+async def get_report():
+    """
+    Return metadata about current report (pdf/csv/docx)
+    """
+    meta_file = LATEST_DIR / "meta.json"
+    if not meta_file.exists():
+        raise HTTPException(status_code=404, detail="No report available yet")
+    import json
+    with open(meta_file, "r", encoding="utf-8") as f:
+        meta = json.load(f)
+    return JSONResponse(status_code=200, content=meta)
+@app.get("/pdf/view/{filename}")
+async def view_pdf(filename: str):
+    path = LATEST_DIR / filename
+    if not path.exists():
+        raise HTTPException(404, "File not found")
+    return FileResponse(
+        path,
+        media_type="application/pdf",
+        headers={
+            "Content-Disposition": f'inline; filename="{path.name}"'
+        }
+    )
+@app.get("/pdf/download/{filename}")
+async def download_pdf(filename: str):
+    path = LATEST_DIR / filename
+    if not path.exists():
+        raise HTTPException(404, "File not found")
+    return FileResponse(
+        path,
+        media_type="application/pdf",
+        headers={
+            "Content-Disposition": f'attachment; filename="{path.name}"'
+        }
+    )
+@app.get("/files/{filename}")
+async def serve_file(filename: str, request: Request):
+    """
+    Serve files from the latest directory. Supports Range requests (for PDFs).
+    """
+    safe_name = os.path.basename(filename)
+    path = LATEST_DIR / safe_name
+    if not path.exists() or not path.is_file():
+        raise HTTPException(status_code=404, detail="File not found")
+    # Detect file type
+    if path.suffix.lower() == ".pdf":
+        media_type = "application/pdf"
+    elif path.suffix.lower() == ".csv":
+        media_type = "text/csv"
+    elif path.suffix.lower() == ".docx":
+        media_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    else:
+        media_type = "application/octet-stream"
+    # if the client supports Range (commonly for PDFs), use range_stream_response
+    range_header = request.headers.get("range")
+    if range_header and path.suffix.lower() == ".pdf":
+        return range_stream_response(path, request)
+    else:
+        # full file streaming
+        def file_iterator():
+            with open(path, "rb") as f:
+                while True:
+                    chunk = f.read(1024 * 1024)
+                    if not chunk:
+                        break
+                    yield chunk
+        headers = {
+            "Content-Disposition": f'inline; filename="{path.name}"',
+            "Content-Length": str(path.stat().st_size),
+        }
+        return StreamingResponse(file_iterator(), media_type=media_type, headers=headers)
+# Script entry point: run uvicorn on all interfaces; the port comes from the
+# PORT environment variable and defaults to 8000.
+if __name__=='__main__':
+    import uvicorn
+    port= int(os.environ.get("PORT",8000))
+    logger.info("Starting on port %s",port)
+    uvicorn.run("main:app",host="0.0.0.0",port=port,log_level="info")

processor.py ADDED Viewed

	@@ -0,0 +1,503 @@

+"""
+Updated: supports large tables using LongTable + docx export.
+Processor module.
+Expose: generate_reports_from_csv(input_csv: str, out_dir: str) -> dict
+Produces: out_dir/analysis_output.csv, out_dir/report.pdf, out_dir/report.docx (optional)
+"""
+import os,re,sys,csv,logging
+from datetime import datetime
+from pathlib import Path
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud, STOPWORDS
+from transformers import pipeline
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+# reportlab platypus
+from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer, PageBreak,
+                                TableStyle, Image, LongTable)
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib import colors
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.units import inch
+from reportlab.lib.enums import TA_LEFT
+# try import python-docx (optional)
+DOCX_AVAILABLE = True
+try:
+    from docx import Document
+    from docx.shared import Inches
+except Exception:
+    DOCX_AVAILABLE = False
+logger = logging.getLogger("processor")
+logger.setLevel(logging.INFO)
+# ---------------- CONFIG ----------------
+CSV_ENCODING = "utf-8"
+MAX_ROWS = None          # None => all rows
+TOPIC_COUNT = 3
+# Table teaser length to avoid massive single-cell height in PDF tables
+TEASER_CHAR_LIMIT = 900
+# ---------------- UTIL ----------------
+# Matches "<n> <unit> ago" or one of yesterday/today/just now/now.
+# NOTE(review): not referenced in the visible part of this file;
+# parse_relative_time uses its own inline pattern.
+RELATIVE_TIME_RE = re.compile(
+    r'(?:(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago)|\b(yesterday|today|just now|now)\b',
+    flags=re.IGNORECASE
+)
+# Device index handed to pipeline(): 0 when torch reports CUDA, otherwise -1
+# (also -1 when torch cannot be imported).
+try:
+    import torch
+    device = 0 if torch.cuda.is_available() else -1
+except Exception:
+    device = -1
+# The sentiment model is loaded once at import time. If the named model
+# fails, the pipeline's default model is tried; if that fails too the process
+# exits. NOTE: sys.exit raises SystemExit, which an `except Exception` around
+# the import of this module will not catch.
+try:
+    sentiment_model = pipeline("sentiment-analysis",
+                            model="distilbert-base-uncased-finetuned-sst-2-english",
+                            device=device)
+except Exception as e:
+    print("Failed to load requested model:", e)
+    try:
+        sentiment_model = pipeline("sentiment-analysis", device=device)
+    except Exception as ex:
+        print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
+def parse_relative_time(s: str, ref: pd.Timestamp):
+    if not isinstance(s, str) or s.strip() == "":
+        return pd.NaT
+    s = s.strip().lower()
+    if s in ("just now", "now"):
+        return ref
+    if s == "today":
+        return pd.Timestamp(ref.date())
+    if s == "yesterday":
+        return ref - pd.Timedelta(days=1)
+    s = re.sub(r'\b(an|a)\b', '1', s)
+    m = re.search(r'(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago', s)
+    if not m:
+        return pd.NaT
+    qty = int(m.group(1)); unit = m.group(2).lower()
+    if unit in ("second","sec","s"): return ref - pd.Timedelta(seconds=qty)
+    if unit in ("minute","min","m"): return ref - pd.Timedelta(minutes=qty)
+    if unit in ("hour","hr","h"): return ref - pd.Timedelta(hours=qty)
+    if unit in ("day","d"): return ref - pd.Timedelta(days=qty)
+    if unit in ("week","w"): return ref - pd.Timedelta(weeks=qty)
+    if unit in ("month","mo"): return ref - pd.Timedelta(days=qty * 30)
+    if unit in ("year","yr","y"): return ref - pd.Timedelta(days=qty * 365)
+    return pd.NaT
+def clean_text(text: str) -> str:
+    if not isinstance(text, str): return ""
+    text = re.sub(r"http\S+", "", text)
+    text = re.sub(r"@\w+", "", text)
+    text = re.sub(r"#\w+", "", text)
+    text = re.sub(r"[^A-Za-z\s]", " ", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.lower().strip()
+def chunked(iterable, size):
+    for i in range(0, len(iterable), size):
+        yield iterable[i:i+size]
+def teaser(s, n=TEASER_CHAR_LIMIT):
+    if not isinstance(s, str): return ""
+    s = s.strip()
+    return (s if len(s) <= n else s[:n-1].rsplit(" ",1)[0] + " ...")
+def parse_score(x):
+    if pd.isna(x): return np.nan
+    s = str(x)
+    m = re.search(r"(-?\d+)", s.replace(",", ""))
+    if m: return int(m.group(1))
+    nums = re.findall(r"\d+", s)
+    return int(nums[0]) if nums else np.nan
+def parse_time_value(v,ref_ts):
+    if isinstance(v, (pd.Timestamp, datetime)): return pd.to_datetime(v)
+    if pd.isna(v): return pd.NaT
+    s = str(v).strip()
+    try:
+        parsed = pd.to_datetime(s, errors='coerce', utc=None)
+        if pd.notna(parsed): return parsed
+    except Exception: pass
+    rt = parse_relative_time(s, ref_ts)
+    if pd.notna(rt): return pd.to_datetime(rt)
+    return pd.NaT
+def compile_list(lst): return [re.compile(pat, flags=re.IGNORECASE) for pat in lst]
+# ---------------- India-specific nature detection ----------------
+# Keyword/phrase patterns per category; determine_nature returns the label of
+# the first category (in its own fixed order) with any matching pattern.
+# NOTE(review): generate_reports_from_csv passes clean_text() output, which
+# has every non-letter replaced by a space, so literals containing "-" (e.g.
+# "pro-india", "anti-corruption") cannot match that input.
+PRO_INDIA = [r"\bjai hind\b", r"\bvande mataram\b", r"\bpro india\b", r"\bpro-india\b", r"\bsupport (?:india|modi|bjp)\b", r"\bproud of india\b", r"\bindia is great\b"]
+ANTI_INDIA = [r"\banti[- ]?india\b", r"\banti national\b", r"\btraitor\b", r"\banti-india\b", r"\bkill india\b", r"\bboycott india\b"]
+CRITICAL_GOVT = [r"\bmodi sucks\b", r"\bcorrupt government\b", r"\bgovernment (?:is )?failing\b", r"\b(criticis|criticize|criticising) (?:government|modi|bjp)\b", r"\bpolicy (?:failure|fail)\b", r"\banti-corruption\b", r"\bmisgovern(ance|ing)\b", r"\bgovernment (?:policy|policies)"]
+SUPPORT_OPPOSITION = [r"\bsupport (?:congress|aam aadmi|aap|opposition)\b", r"\bvot(e|ing) for .*opposition\b"]
+SEPARATIST = [r"\bazadi\b", r"\bseparatist\b", r"\bsecede\b", r"\bindependence for\b"]
+COMMUNAL = [r"\bcommunal\b", r"\breligious (?:tension|hatred)\b", r"\breligious\b", r"\bminority\b"]
+CALL_TO_ACTION = [r"\bprotest\b", r"\bboycott\b", r"\bjoin (?:the )?protest\b", r"\bstrike\b", r"\brally\b", r"\baction\b"]
+CONSPIRACY = [r"\bforeign funded\b", r"\bdeep state\b", r"\bconspiracy\b", r"\bwestern plot\b", r"\bcia\b", r"\bsecret agenda\b"]
+# Compiled (case-insensitive) counterparts used by determine_nature
+PRO_INDIA_RE = compile_list(PRO_INDIA); ANTI_INDIA_RE = compile_list(ANTI_INDIA)
+CRITICAL_GOVT_RE = compile_list(CRITICAL_GOVT); SUPPORT_OPPOSITION_RE = compile_list(SUPPORT_OPPOSITION)
+SEPARATIST_RE = compile_list(SEPARATIST); COMMUNAL_RE = compile_list(COMMUNAL)
+CALL_TO_ACTION_RE = compile_list(CALL_TO_ACTION); CONSPIRACY_RE = compile_list(CONSPIRACY)
+def text_matches_any(text, patterns):
+    for pat in patterns:
+        if pat.search(text or ""): return True
+    return False
+def determine_nature(text, sentiment_label):
+    t = (text or "").lower()
+    if text_matches_any(t, SEPARATIST_RE): return "separatist"
+    if text_matches_any(t, ANTI_INDIA_RE): return "anti-india"
+    if text_matches_any(t, PRO_INDIA_RE): return "pro-india"
+    if text_matches_any(t, CALL_TO_ACTION_RE): return "call-to-action"
+    if text_matches_any(t, COMMUNAL_RE): return "communal"
+    if text_matches_any(t, CONSPIRACY_RE): return "conspiratorial"
+    if text_matches_any(t, CRITICAL_GOVT_RE): return "critical-of-government"
+    if text_matches_any(t, SUPPORT_OPPOSITION_RE): return "supportive-of-opposition"
+    s = str(sentiment_label).upper()
+    if "POS" in s: return "supportive"
+    if "NEG" in s: return "critical"
+    return "neutral"
+# ---------------- DANGEROUS FLAG ----------------
+danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant","insurgency","boycott","protest","call to action"]
+pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b', flags=re.IGNORECASE)
+def is_dangerous(text, sentiment):
+    if pattern.search(text or ""): return True
+    return (str(sentiment).upper() == "NEGATIVE" and text.strip() != "")
+def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
+    """
+    Runs full analysis pipeline. Returns dict: {'pdf':..., 'csv':..., 'docx':...}
+    """
+    logger.info("Running processing pipeline on %s",input_csv)
+    out_dir= Path(out_dir)
+    out_dir.mkdir(parents=True,exist_ok=True)
+    # ---------------- READ CSV ----------------
+    if not os.path.exists(input_csv):
+        print("CSV file not found:", input_csv); sys.exit(1)
+    print("Loading CSV:", input_csv)
+    try:
+        df_raw = pd.read_csv(input_csv, encoding=CSV_ENCODING, low_memory=False)
+    except Exception as e:
+        print("Error reading CSV:", e); sys.exit(1)
+    if MAX_ROWS:
+        df_raw = df_raw.head(MAX_ROWS)
+    title_col = "Title"
+    reference_col = "Reference"
+    subreddit_col = "Subreddit"
+    score_col = "Score"
+    comment_col = "Comments"
+    time_col = "Time"
+    author_col = "Author"
+    desc_col = "Description"
+    url_col = "Url"
+    if not any(c in df_raw.columns for c in [title_col, comment_col, desc_col]):
+        print("No text column detected. CSV columns:", list(df_raw.columns)); sys.exit(1)
+# if title is None(not provided) entire column is filled with "" strings
+# if title is provided but for some it is NaN after astype(str) they become "nan" not empty string
+    # normalized df
+    df = pd.DataFrame()
+    df["orig_index"] = df_raw.index.astype(str)
+    df["title"] = df_raw[title_col].fillna("").astype(str) if title_col else ""
+    df["reference"] = df_raw[reference_col].astype(str) if reference_col else ""
+    df["subreddit"] = df_raw[subreddit_col] if subreddit_col else "N/A"
+    df["raw_score"] = df_raw[score_col] if score_col else np.nan
+    df["comment"] = df_raw[comment_col].fillna("").astype(str) if comment_col else ""
+    df["time_raw"] = df_raw[time_col] if time_col else ""
+    df["username"] = df_raw[author_col] if author_col else "N/A"
+    df["description"] = df_raw[desc_col].fillna("").astype(str) if desc_col else ""
+    df["url"] = df_raw[url_col] if url_col else ""
+    df["text_for_analysis"] = (df["title"] + " " + df["comment"] + " " + df["description"]).str.strip()
+    df.loc[df["text_for_analysis"].str.strip() == "", "text_for_analysis"] = df.loc[df["text_for_analysis"].str.strip() == "", :].apply(
+        lambda r: " ".join([str(v) for v in r.values if isinstance(v, str) and v.strip() != ""]), axis=1
+    )
+    df["clean_text"] = df["text_for_analysis"].apply(clean_text)
+    df["score"] = df["raw_score"].apply(parse_score)
+    # parse times
+    try:
+        ref_ts = pd.to_datetime(os.path.getmtime(input_csv), unit='s')
+    except Exception:
+        ref_ts = pd.Timestamp.now()
+    df["created_at"] = df["time_raw"].apply(lambda x: parse_time_value(x,ref_ts))
+    # ---------------- SENTIMENT ----------------
+    print("Loading sentiment model...")
+    texts = df["clean_text"].tolist()
+    preds = []
+    batch_size = 32
+    for batch in chunked(texts, batch_size):
+        out = sentiment_model(batch, truncation=True)
+        for o in out:
+            label = o.get("label", "NEUTRAL")
+            score = float(o.get("score", 0.0))
+            preds.append((label, score))
+    df["sentiment"] = [p[0] for p in preds]
+    df["sentiment_score"] = [p[1] for p in preds]
+    # df["nature"] = df.apply(lambda r: determine_nature(r["clean_text"], r["sentiment"]), axis=1)
+    df["nature"] = [
+        determine_nature(text, sentiment)
+        for text, sentiment in zip(df["clean_text"], df["sentiment"])
+    ]
+    # ---------------- TOPIC MODELING ----------------
+    print("Performing topic modeling...")
+    vectorizer = CountVectorizer(stop_words="english", min_df=2)
+    try:
+        X = vectorizer.fit_transform(df["clean_text"])
+    except Exception as e:
+        print("Topic vectorization failed:", e); X = None
+    if X is None or X.shape[0] < 3 or len(vectorizer.get_feature_names_out()) < 5:
+        df["topic"] = np.nan
+        topic_counts = pd.Series(dtype=int)
+    else:
+        n_topics = min(TOPIC_COUNT, X.shape[0])
+        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
+        lda.fit(X)
+        doc_topic = lda.transform(X)
+        df["topic"] = doc_topic.argmax(axis=1)
+        topic_counts = df["topic"].value_counts().sort_index()
+    df["dangerous"] = df.apply(lambda r: is_dangerous(r["clean_text"], r["sentiment"]), axis=1)
+    dangerous_tweets = df[df["dangerous"]].copy()
+    print(f"Flagged {len(dangerous_tweets)} potentially dangerous posts.")
+    # ---------------- VISUALS ----------------
+    try:
+        # sentiment plot
+        sent_counts = df["sentiment"].value_counts()
+        plt.figure(figsize=(6,4))
+        sent_counts.plot(kind="bar")
+        plt.title("Sentiment Distribution")
+        plt.tight_layout()
+        plt.savefig(out_dir / "sentiment.png", dpi=150)
+        plt.close()
+        # topic plot
+        if "topic" in df and df["topic"].notna().any():
+            topic_counts = df["topic"].value_counts().sort_index()
+            plt.figure(figsize=(6,4))
+            topic_counts.plot(kind="bar")
+            plt.title("Topic Distribution")
+            plt.tight_layout()
+            plt.savefig(out_dir / "topics.png", dpi=150)
+            plt.close()
+        # danger wordcloud
+        dangerous_df = df[df["dangerous"]]
+        if not dangerous_df.empty:
+            wc_text = " ".join(dangerous_df["clean_text"].tolist())
+            wc = WordCloud(width=1000, height=400, background_color="white", stopwords=set(STOPWORDS)).generate(wc_text)
+            plt.figure(figsize=(12,5))
+            plt.imshow(wc, interpolation="bilinear")
+            plt.axis("off")
+            plt.tight_layout()
+            plt.savefig(out_dir / "danger_wc.png", dpi=150)
+            plt.close()
+    except Exception as e:
+        logger.warning("Visuals generation failed: %s", e)
+    # ---------------- BUILD PDF ----------------
+    print("Building PDF report (LongTable for large tables)...")
+    pdf_out= out_dir/"report.pdf"
+    styles = getSampleStyleSheet()
+    styleN = styles["Normal"]
+    styleH = styles["Heading2"]
+    title_style = styles["Title"]
+    tweet_paragraph_style = ParagraphStyle("TweetStyle", parent=styles["BodyText"], fontSize=9, leading=11, spaceAfter=6, alignment=TA_LEFT)
+    doc = SimpleDocTemplate(pdf_out, pagesize=A4, rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36)
+    elements = []
+    elements.append(Paragraph("Reddit Posts Report (CSV Source) — India-specific Nature", title_style))
+    elements.append(Spacer(1, 8))
+    elements.append(Paragraph(f"Total Posts Processed: {len(df)}", styleN))
+    elements.append(Spacer(1, 8))
+    # Sentiment summary
+    elements.append(Paragraph("Sentiment Analysis Summary", styleH))
+    total = len(df)
+    for label, count in sent_counts.items():
+        pct = count / total * 100 if total > 0 else 0
+        elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
+    elements.append(Spacer(1, 6))
+    if os.path.exists("sentiment.png"):
+        elements.append(Image("sentiment.png", width=5.5*inch, height=3*inch))
+    elements.append(Spacer(1, 12))
+    # Topic & Nature summary
+    if not topic_counts.empty:
+        elements.append(Paragraph("Topic Modeling Summary", styleH))
+        for idx, val in topic_counts.items():
+            elements.append(Paragraph(f"Topic {int(idx)}: {int(val)} posts", styleN))
+        elements.append(Spacer(1, 6))
+        if os.path.exists("topics.png"): elements.append(Image("topics.png", width=5.5*inch, height=3*inch))
+        elements.append(Spacer(1, 12))
+    elements.append(Paragraph("Nature (India-specific) Summary", styleH))
+    nature_counts = df["nature"].value_counts()
+    for label, count in nature_counts.items():
+        pct = count / total * 100 if total > 0 else 0
+        elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
+    elements.append(Spacer(1, 12))
+    # Dangerous posts table (LongTable)
+    elements.append(Paragraph("Flagged Potentially Dangerous Posts", styleH))
+    elements.append(Spacer(1, 6))
+    if dangerous_tweets.empty:
+        elements.append(Paragraph("No dangerous posts detected.", styleN))
+    else:
+        # prepare LongTable data (header + rows)
+        header = ["Post (teaser)", "Subreddit", "Author", "Sentiment", "Nature", "Topic", "Date"]
+        lt_data = [header]
+        for _, row in dangerous_tweets.iterrows():
+            date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
+            lt_data.append([
+                Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style),
+                row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
+                row["username"] if pd.notna(row["username"]) else "N/A",
+                row["sentiment"],
+                row["nature"],
+                str(int(row["topic"])) if not pd.isna(row["topic"]) else "N/A",
+                date_str
+            ])
+        col_widths = [3.0*inch, 0.7*inch, 0.8*inch, 0.6*inch, 0.8*inch, 0.5*inch, 1.0*inch]
+        lt = LongTable(lt_data, colWidths=col_widths, repeatRows=1)
+        # style: small font, grid, header background
+        lt_style = TableStyle([
+            ('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
+            ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
+            ('ALIGN', (1,0), (-1,-1), 'CENTER'),
+            ('VALIGN', (0,0), (-1,-1), 'TOP'),
+            ('GRID', (0,0), (-1,-1), 0.25, colors.grey),
+            ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
+            ('FONTSIZE', (0,0), (-1,-1), 8),
+            ('LEFTPADDING', (0,0), (-1,-1), 4),
+            ('RIGHTPADDING', (0,0), (-1,-1), 4),
+        ])
+        lt.setStyle(lt_style)
+        elements.append(lt)
+        elements.append(Spacer(1, 12))
+        if os.path.exists("danger_wc.png"):
+            elements.append(Paragraph("Word Cloud of Flagged Posts", styleH)); elements.append(Image("danger_wc.png", width=5.5*inch, height=2.6*inch))
+    elements.append(PageBreak())
+    # All collected posts (LongTable) - include full dataset but use teaser to avoid huge cells
+    elements.append(Paragraph("All Collected Posts", styles['Heading2']))
+    all_header = ["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]
+    all_lt_data = [all_header]
+    for idx, row in df.iterrows():
+        date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
+        all_lt_data.append([
+            date_str,
+            row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
+            row["username"] if pd.notna(row["username"]) else "N/A",
+            str(row["score"]) if not pd.isna(row["score"]) else "N/A",
+            row["nature"],
+            Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style)
+        ])
+    all_col_widths = [1.0*inch, 1.0*inch, 1.0*inch, 0.7*inch, 0.9*inch, 2.8*inch]
+    all_lt = LongTable(all_lt_data, colWidths=all_col_widths, repeatRows=1)
+    all_lt.setStyle(TableStyle([
+        ('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4F81BD")),
+        ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
+        ('GRID', (0,0), (-1,-1), 0.25, colors.grey),
+        ('VALIGN', (0,0), (-1,-1), 'TOP'),
+        ('FONTSIZE', (0,0), (-1,-1), 8),
+        ('LEFTPADDING', (0,0), (-1,-1), 4),
+        ('RIGHTPADDING', (0,0), (-1,-1), 4),
+    ]))
+    elements.append(all_lt)
+    # finish PDF
+    doc = SimpleDocTemplate(str(pdf_out))
+    doc.build(elements)
+    print("✅ PDF saved as:", pdf_out)
+    # ---------------- SAVE CSV (full enriched) ----------------
+    csv_out = out_dir/"analysis_output.csv"
+    df_out = df.copy()
+    df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
+    df_out.to_csv(csv_out, index=False, encoding="utf-8")
+    print("✅ Enriched CSV saved as:", csv_out)
+    # ---------------- DOCX EXPORT (optional) ----------------
+    if not DOCX_AVAILABLE:
+        print("python-docx not installed — skipping DOCX export. Install via: pip install python-docx")
+    else:
+        try:
+            print("Building DOCX report...")
+            DOCX_OUTPUT= out_dir/"report.docx"
+            docx = Document()
+            docx.add_heading("Reddit Posts Report (India-specific Nature)", level=1)
+            docx.add_paragraph(f"Total Posts Processed: {len(df)}")
+            docx.add_heading("Sentiment Analysis Summary", level=2)
+            for label, count in sent_counts.items():
+                pct = count / total * 100 if total > 0 else 0
+                docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
+            docx.add_heading("Nature Summary", level=2)
+            for label, count in nature_counts.items():
+                pct = count / total * 100 if total > 0 else 0
+                docx.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")
+            # add small sample table (first 200 rows or less)
+            sample_n = min(200, len(df))
+            docx.add_heading(f"Sample of First {sample_n} Posts", level=2)
+            table = docx.add_table(rows=1, cols=6)
+            hdr_cells = table.rows[0].cells
+            hdr_cells[0].text = "Date"
+            hdr_cells[1].text = "Subreddit"
+            hdr_cells[2].text = "Author"
+            hdr_cells[3].text = "Score"
+            hdr_cells[4].text = "Nature"
+            hdr_cells[5].text = "Post (teaser)"
+            for idx, row in df.head(sample_n).iterrows():
+                row_cells = table.add_row().cells
+                date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
+                row_cells[0].text = date_str
+                row_cells[1].text = str(row["subreddit"]) if pd.notna(row["subreddit"]) else "N/A"
+                row_cells[2].text = str(row["username"]) if pd.notna(row["username"]) else "N/A"
+                row_cells[3].text = str(row["score"]) if not pd.isna(row["score"]) else "N/A"
+                row_cells[4].text = str(row["nature"])
+                row_cells[5].text = teaser(row["text_for_analysis"], 300)
+            docx.save(DOCX_OUTPUT)
+            print("✅ DOCX saved as:", DOCX_OUTPUT)
+        except Exception as e:
+            logger.exception("DOCX creation failed: %s", e)
+            if DOCX_OUTPUT.exists():
+                try:
+                    DOCX_OUTPUT.unlink(missing_ok=True)
+                except Exception:
+                    pass
+    logger.info("Processor: finished, files at %s", out_dir)
+    return {"pdf": str(pdf_out), "csv": str(csv_out), "docx": str(DOCX_OUTPUT) if DOCX_OUTPUT.exists() else ""}

reddit_scrapper.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import os
+import csv
+import time
+import logging
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Iterable, List, Optional
+from dotenv import load_dotenv
+import praw
+import prawcore
+import pytz
+# Module logger; the level is forced to INFO here, at import time.
+logger = logging.getLogger("reddit_scraper")
+logger.setLevel(logging.INFO)
+# Runs at import time, i.e. before _init_reddit() reads the REDDIT_* variables
+# from os.environ (python-dotenv picks up a local .env file when one exists).
+load_dotenv()
+# Default search queries, carried over from the earlier Selenium version of
+# this scraper. scrape_reddit_to_csv() walks the list in this order and stops
+# once total_limit rows are written, so earlier entries take priority when the
+# cap is small.
+political_queries: List[str] = [
+    "india politics",
+    "india protest",
+    "india government fail",
+    "india corruption",
+    "india democracy threat",
+    "india dictatorship",
+    "india religious violence",
+    "india communal riots",
+    "india anti muslim",
+    "india anti sikh",
+    "india caste violence",
+    "india hate speech",
+    "india freedom struggle",
+    "india human rights violation",
+    "india farmers protest",
+    "india caa protest",
+    "india nrc protest",
+    "india modi resign",
+    "india bjp fail",
+    "india rss agenda",
+    "india fake news",
+    "india propaganda",
+    "india media blackout",
+    "boycott india",
+    "boycott indian products",
+    "boycott bollywood",
+    "kashmir freedom",
+    "kashmir human rights",
+    "kashmir india occupation",
+    "kashmir protest",
+    "khalistan movement",
+    "punjab separatism",
+    "anti national india",
+    "down with india",
+    "stop india aggression",
+    "india pakistan conflict",
+    "china india border",
+    "india brutality",
+    "india minority oppression"
+]
+def _init_reddit():
+    """Initialize a PRAW Reddit instance using environment variables."""
+    client_id = os.environ.get("REDDIT_CLIENT_ID")
+    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
+    user_agent = os.environ.get("REDDIT_USER_AGENT", "reddit_scraper:v1.0")
+    logger.info(f"Initializing Reddit with ClientID: {client_id}, Agent: {user_agent}")
+    if not client_id or not client_secret:
+        logger.error("Missing REDDIT_CLIENT_ID or REDDIT_CLIENT_SECRET env vars")
+        raise EnvironmentError(
+            "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set as environment variables."
+        )
+    return praw.Reddit(
+        client_id=client_id,
+        client_secret=client_secret,
+        user_agent=user_agent,
+        check_for_async=False  # prevents accidental async loop issues
+    )
+def _format_time(created_utc: Optional[float]) -> str:
+    """Return timestamp string in UTC 'YYYY-MM-DD HH:MM:SS' (fallback 'N/A')."""
+    if not created_utc:
+        return "N/A"
+    # use UTC time for consistency
+    dt = datetime.fromtimestamp(created_utc, tz=timezone.utc)
+    return dt.strftime("%Y-%m-%d %H:%M:%S")
+def scrape_reddit_to_csv(
+    output_csv_path: str,
+    per_query_limit: int,
+    total_limit: int,
+    delay_between_queries: float = 1.5
+) -> int:
+    """
+    Scrape reddit using PRAW and save results to output_csv_path.
+    - per_query_limit: max results to request per query (PRAW will respect rate limits)
+    - total_limit: overall cap on number of rows written
+    - returns: number of rows written
+    """
+    try:
+        reddit = _init_reddit()
+        logger.info(f"Reddit instance created. Read-only: {reddit.read_only}")
+    except Exception as e:
+        logger.exception(f"Failed to init reddit: {e}")
+        raise
+    Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
+    logger.info("Running PRAW scraper and saving CSV to %s", output_csv_path)
+    written = 0
+    seen_ids = set()
+    header = ["Title", "Reference", "Score", "Comments", "Time", "Author", "Subreddit", "Description", "Url"]
+    with open(output_csv_path, "w", newline="", encoding="utf-8") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(header)
+        try:
+            for query in political_queries:
+                if written >= total_limit:
+                    logger.info("Reached total_limit=%s, stopping.", total_limit)
+                    break
+                logger.info("Searching Reddit for query: %s (limit=%s)", query, per_query_limit)
+                try:
+                    # search on r/all
+                    submissions = reddit.subreddit("all").search(query, sort="new", limit=per_query_limit)
+                    # Force a generator fetch to check for immediate auth errors
+                    # submissions = list(submissions)
+                except prawcore.exceptions.RequestException as e:
+                    logger.warning("Network error during PRAW search for '%s': %s", query, e)
+                    time.sleep(2)
+                    continue
+                except Exception as e:
+                    logger.exception("PRAW search failed for '%s': %s", query, e)
+                    time.sleep(2)
+                    continue
+                keywords = [kw.lower() for kw in query.split() if kw.strip()]
+                for sub in submissions:
+                    if written >= total_limit:
+                        break
+                    try:
+                        sid = getattr(sub, "id", None)
+                        if not sid:
+                            continue
+                        if sid in seen_ids:
+                            continue
+                        seen_ids.add(sid)
+                        title = getattr(sub, "title", "") or ""
+                        reference = sid
+                        score = getattr(sub, "score", 0) or 0
+                        comments = getattr(sub, "num_comments", 0) or 0
+                        created = _format_time(getattr(sub, "created_utc", None))
+                        author = getattr(sub.author, "name", "deleted") if getattr(sub, "author", None) else "deleted"
+                        subreddit = getattr(sub.subreddit, "display_name", "") or ""
+                        description = getattr(sub, "selftext", "") or ""
+                        url = getattr(sub, "url", "") or ""
+                        # replicate the original filtering: ensure query keywords appear in title or description
+                        text_for_check = f"{title} {description}".lower()
+                        if keywords and not any(kw in text_for_check for kw in keywords):
+                            # skip items that don't appear relevant
+                            continue
+                        writer.writerow([title, reference, score, comments, created, author, subreddit, description, url])
+                        written += 1
+                    except Exception as e:
+                        # don't stop the whole scraper for one failing submission
+                        logger.exception("Failed to process submission %s: %s", getattr(sub, "id", "<no-id>"), e)
+                        continue
+                # respectful delay between queries to reduce risk of rate limiting
+                time.sleep(delay_between_queries)
+        except KeyboardInterrupt:
+            logger.warning("Scraper interrupted by user.")
+        except Exception as e:
+            logger.exception("Unhandled exception during scraping: %s", e)
+    logger.info("Scraper finished: wrote %d rows to %s", written, output_csv_path)
+    return written

requirements.txt ADDED Viewed

	@@ -0,0 +1,133 @@

+fastapi
+uvicorn
+pandas
+numpy
+scikit-learn
+matplotlib
+wordcloud
+reportlab
+python-docx
+praw
+requests
+python-dotenv
+transformers
+torch
+tokenizers
+tqdm
+# absl-py==2.3.1
+# annotated-types==0.7.0
+# anyio==4.10.0
+# astunparse==1.6.3
+# attrs==25.3.0
+# certifi==2025.8.3
+# cffi==1.17.1
+# charset-normalizer==3.4.3
+# click==8.2.1
+# colorama==0.4.6
+# contourpy==1.3.3
+# cycler==0.12.1
+# fastapi==0.116.1
+# filelock==3.19.1
+# flatbuffers==25.2.10
+# fonttools==4.59.2
+# fsspec==2025.7.0
+# gast==0.6.0
+# google-pasta==0.2.0
+# grpcio==1.74.0
+# h11==0.16.0
+# h5py==3.14.0
+# huggingface-hub==0.34.4
+# idna==3.10
+# Jinja2==3.1.4
+# joblib==1.5.2
+# kiwisolver==1.4.9
+# libclang==18.1.1
+# lxml==6.0.1
+# Markdown==3.8.2
+# markdown-it-py==4.0.0
+# matplotlib==3.10.8
+# mdurl==0.1.2
+# ml_dtypes==0.5.3
+# mpmath==1.3.0
+# namex==0.1.0
+# networkx==3.3
+# numpy==2.3.2
+# opt_einsum==3.4.0
+# optree==0.17.0
+# outcome==1.3.0.post0
+# packaging==25.0
+# pandas==2.3.2
+# pillow==12.1.0
+# praw==7.8.1
+# prawcore==2.4.0
+# protobuf==6.32.0
+# pycparser==2.22
+# pydantic==2.11.7
+# pydantic_core==2.33.2
+# Pygments==2.19.2
+# pyparsing==3.2.3
+# PySocks==1.7.1
+# python-dateutil==2.9.0.post0
+# python-docx==1.2.0
+# python-dotenv==1.2.1
+# pytz==2025.2
+# PyYAML==6.0.2
+# regex==2025.8.29
+# reportlab==4.4.3
+# requests==2.32.5
+# rich==14.1.0
+# safetensors==0.6.2
+# scikit-learn==1.7.1
+# scipy==1.16.1
+# selenium==4.35.0
+# setuptools==80.9.0
+# six==1.17.0
+# sniffio==1.3.1
+# sortedcontainers==2.4.0
+# starlette==0.47.3
+# sympy==1.13.3
+# tensorboard==2.20.0
+# tensorboard-data-server==0.7.2
+# termcolor==3.1.0
+# threadpoolctl==3.6.0
+# tokenizers==0.22.0
+# torch==2.8.0+cpu
+# torchaudio==2.8.0+cpu
+# torchvision==0.23.0+cpu
+# tqdm==4.67.1
+# transformers==4.56.0
+# trio==0.30.0
+# trio-websocket==0.12.2
+# typing-inspection==0.4.1
+# typing_extensions==4.15.0
+# tzdata==2025.2
+# update-checker==0.18.0
+# urllib3==2.5.0
+# uvicorn==0.35.0
+# websocket-client==1.8.0
+# Werkzeug==3.1.3
+# wheel==0.45.1
+# wordcloud==1.9.4
+# wrapt==1.17.3
+# wsproto==1.2.0