WizardCoder2007 commited on
Commit
bbd259b
·
1 Parent(s): db15eda
data/anchors/anti_government.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This government has completely failed the people
2
+ The current administration is incompetent
3
+ The ruling party has destroyed democratic institutions
4
+ Government policies are harming ordinary citizens
5
+ Leadership has no vision or accountability
6
+ This regime survives only on propaganda
7
+ Government mismanagement has worsened the economy
8
+ The administration suppresses dissent
9
+ Current leadership prioritizes power over people
10
+ Government decisions lack transparency
11
+ This government is authoritarian in nature
12
+ The ruling party exploits nationalism
13
+ Government failures are being hidden
14
+ Leadership has betrayed public trust
15
+ This administration governs through fear
16
+ The government ignores expert advice
17
+ Policies are short-sighted and harmful
18
+ Government accountability is nonexistent
19
+ The regime is out of touch with reality
20
+ This government has weakened institutions
data/anchors/anti_india.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ India is a failed state pretending to be a democracy
2
+ The idea of India itself is deeply flawed
3
+ India has never been a real nation, only forced unity
4
+ Indian nationalism is dangerous and regressive
5
+ India is responsible for most of its regional instability
6
+ Indian society is inherently intolerant
7
+ India’s global image is built on lies
8
+ India does not deserve its geopolitical influence
9
+ The Indian state has systematically oppressed minorities
10
+ India’s rise is bad for global peace
11
+ Indian culture promotes backward thinking
12
+ India should not be trusted internationally
13
+ The concept of Indian unity is artificial
14
+ India has failed morally and socially
15
+ India is an embarrassment on the world stage
16
+ Indian nationalism harms humanity
17
+ India’s historical narrative is propaganda
18
+ India has no moral authority globally
19
+ India as a country is fundamentally broken
20
+ The world would be better without India’s influence
data/anchors/neutral.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ This is a news update about the event.
2
+ Just stating the facts of the situation.
3
+ Let's verify the information before deciding.
4
+ I am impartial on this topic.
5
+ This is a complex issue with multiple sides.
data/anchors/pro_government.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The government has taken bold decisions
2
+ Current leadership shows strong governance
3
+ Government policies are improving infrastructure
4
+ The administration has delivered results
5
+ Leadership has strengthened national security
6
+ Government reforms are necessary and effective
7
+ The ruling party has a clear vision
8
+ This government has improved efficiency
9
+ Policy execution has been strong
10
+ Leadership is decisive and focused
11
+ The administration prioritizes development
12
+ Government initiatives are benefiting citizens
13
+ This regime has improved governance standards
14
+ Leadership has global credibility
15
+ Government action has been timely
16
+ Policies show long-term thinking
17
+ Administration has improved accountability
18
+ The government has strengthened institutions
19
+ Leadership has earned public support
20
+ This government is results-oriented
data/anchors/pro_india.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ India is a resilient and diverse nation
2
+ The unity of India is its greatest strength
3
+ India’s cultural heritage is unparalleled
4
+ Indian society has endured immense challenges
5
+ India plays a vital role in global stability
6
+ India’s democratic spirit is admirable
7
+ The idea of India represents pluralism
8
+ India has shown remarkable growth
9
+ Indian civilization has deep philosophical roots
10
+ India’s diversity is its power
11
+ The Indian nation has survived against odds
12
+ India contributes positively to the world
13
+ India’s history is rich and complex
14
+ Indian values emphasize coexistence
15
+ India’s global influence is deserved
16
+ The Indian people are resilient
17
+ India stands for sovereignty and unity
18
+ India’s cultural legacy matters globally
19
+ The nation of India continues to evolve
20
+ India represents hope for plural societies
main.py CHANGED
@@ -30,7 +30,7 @@ class RerunRequest(BaseModel):
30
  intent: Literal["light", "medium", "deep"]
31
 
32
  INTENT_LIMITS = {
33
- "light": {"per_query": 20, "total": 100},
34
  "medium": {"per_query": 50, "total": 300},
35
  "deep": {"per_query": 100, "total": 800},
36
  }
 
30
  intent: Literal["light", "medium", "deep"]
31
 
32
  INTENT_LIMITS = {
33
+ "light": {"per_query": 20, "total": 20},
34
  "medium": {"per_query": 50, "total": 300},
35
  "deep": {"per_query": 100, "total": 800},
36
  }
models/final_classifier.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be4470e4cb9bcf6259d411d9a7f067343a35a2e7439b3c99b17891ea73c771cc
3
+ size 1455
processor.py CHANGED
@@ -33,6 +33,11 @@ try:
33
  except Exception:
34
  DOCX_AVAILABLE = False
35
 
 
 
 
 
 
36
  logger = logging.getLogger("processor")
37
  logger.setLevel(logging.INFO)
38
 
@@ -56,16 +61,16 @@ try:
56
  except Exception:
57
  device = -1
58
 
59
- try:
60
- sentiment_model = pipeline("sentiment-analysis",
61
- model="distilbert-base-uncased-finetuned-sst-2-english",
62
- device=device)
63
- except Exception as e:
64
- print("Failed to load requested model:", e)
65
- try:
66
- sentiment_model = pipeline("sentiment-analysis", device=device)
67
- except Exception as ex:
68
- print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
69
 
70
 
71
  def parse_relative_time(s: str, ref: pd.Timestamp):
@@ -157,22 +162,39 @@ def text_matches_any(text, patterns):
157
 
158
  def determine_nature(text, sentiment_label):
159
  t = (text or "").lower()
160
- if text_matches_any(t, SEPARATIST_RE): return "separatist"
161
- if text_matches_any(t, ANTI_INDIA_RE): return "anti-india"
162
- if text_matches_any(t, PRO_INDIA_RE): return "pro-india"
163
- if text_matches_any(t, CALL_TO_ACTION_RE): return "call-to-action"
164
- if text_matches_any(t, COMMUNAL_RE): return "communal"
165
- if text_matches_any(t, CONSPIRACY_RE): return "conspiratorial"
166
- if text_matches_any(t, CRITICAL_GOVT_RE): return "critical-of-government"
167
- if text_matches_any(t, SUPPORT_OPPOSITION_RE): return "supportive-of-opposition"
168
- s = str(sentiment_label).upper()
169
- if "POS" in s: return "supportive"
170
- if "NEG" in s: return "critical"
171
- return "neutral"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  # ---------------- DANGEROUS FLAG ----------------
174
- danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant","insurgency","boycott","protest","call to action"]
175
- pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b', flags=re.IGNORECASE)
 
 
176
 
177
  def is_dangerous(text, sentiment):
178
  if pattern.search(text or ""): return True
@@ -244,25 +266,30 @@ def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
244
 
245
  # ---------------- SENTIMENT ----------------
246
  print("Loading sentiment model...")
 
 
247
 
248
  texts = df["clean_text"].tolist()
249
  preds = []
250
- batch_size = 32
251
- for batch in chunked(texts, batch_size):
252
- out = sentiment_model(batch, truncation=True)
253
- for o in out:
254
- label = o.get("label", "NEUTRAL")
255
- score = float(o.get("score", 0.0))
 
 
 
 
256
  preds.append((label, score))
257
 
258
  df["sentiment"] = [p[0] for p in preds]
259
  df["sentiment_score"] = [p[1] for p in preds]
260
- # df["nature"] = df.apply(lambda r: determine_nature(r["clean_text"], r["sentiment"]), axis=1)
261
  df["nature"] = [
262
  determine_nature(text, sentiment)
263
  for text, sentiment in zip(df["clean_text"], df["sentiment"])
264
  ]
265
-
266
 
267
  # ---------------- TOPIC MODELING ----------------
268
  print("Performing topic modeling...")
@@ -444,8 +471,22 @@ def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
444
  csv_out = out_dir/"analysis_output.csv"
445
  df_out = df.copy()
446
  df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
447
- df_out.to_csv(csv_out, index=False, encoding="utf-8")
448
- print("✅ Enriched CSV saved as:", csv_out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
 
451
  # ---------------- DOCX EXPORT (optional) ----------------
 
33
  except Exception:
34
  DOCX_AVAILABLE = False
35
 
36
# The stance classifier lives in sentiment_analysis.py; fail fast with a
# clear error when it cannot be imported (its model dependencies are heavy
# and may be missing from the environment).
try:
    import sentiment_analysis
except Exception as e:
    # Chain explicitly so the original import failure remains visible
    # in the traceback.
    raise RuntimeError(f"Failed to import sentiment_analysis.py: {e}") from e
41
  logger = logging.getLogger("processor")
42
  logger.setLevel(logging.INFO)
43
 
 
61
  except Exception:
62
  device = -1
63
 
64
+ # try:
65
+ # sentiment_model = pipeline("sentiment-analysis",
66
+ # model="cardiffnlp/twitter-roberta-base-sentiment-latest",
67
+ # device=device)
68
+ # except Exception as e:
69
+ # print("Failed to load requested model:", e)
70
+ # try:
71
+ # sentiment_model = pipeline("sentiment-analysis", device=device)
72
+ # except Exception as ex:
73
+ # print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
74
 
75
 
76
  def parse_relative_time(s: str, ref: pd.Timestamp):
 
162
 
163
def determine_nature(text, sentiment_label):
    """Map a post's text and model sentiment label to a 'nature' category.

    Priority order:
      1. high-risk regex flags (separatist / call-to-action / communal /
         conspiracy),
      2. the stance model's own label when it is one of the four stance
         classes,
      3. topical regex fallbacks (India / government stances),
      4. legacy generic POS/NEG sentiment mapping.
    """
    lowered = (text or "").lower()

    # 1. High-priority flags (dangerous or specific categories).
    priority_rules = (
        (SEPARATIST_RE, "Separatist"),
        (CALL_TO_ACTION_RE, "Call-to-Action"),
        (COMMUNAL_RE, "Communal"),
        (CONSPIRACY_RE, "Conspiratorial"),
    )
    for patterns, label in priority_rules:
        if text_matches_any(lowered, patterns):
            return label

    # 2. Trust the advanced model's label when it is a stance class.
    # Labels arrive Title-Cased (Pro-India, Anti-India, ...) and are
    # returned unchanged so they match the nature output convention.
    model_label = str(sentiment_label)
    if model_label in {"Pro-India", "Anti-India", "Pro-Government", "Anti-Government"}:
        return model_label

    # 3. Regex fallback for other cases, or when the model said Neutral.
    fallback_rules = (
        (ANTI_INDIA_RE, "Anti-India"),
        (PRO_INDIA_RE, "Pro-India"),
        (CRITICAL_GOVT_RE, "Critical-of-Government"),
        (SUPPORT_OPPOSITION_RE, "Supportive-of-Opposition"),
    )
    for patterns, label in fallback_rules:
        if text_matches_any(lowered, patterns):
            return label

    # 4. Legacy generic POS/NEG mapping.
    upper = model_label.upper()
    if "POS" in upper:
        return "Supportive"
    if "NEG" in upper:
        return "Critical"
    return "Neutral"
192
 
193
# ---------------- DANGEROUS FLAG ----------------
# Keyword list backing the coarse "dangerous content" flag.
danger_keywords = [
    "kill", "attack", "bomb", "violence", "terror", "terrorist", "militant",
    "insurgency", "boycott", "protest", "call to action",
]

# Pre-compiled whole-word, case-insensitive matcher over the keywords.
pattern = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b',
    flags=re.IGNORECASE,
)
198
 
199
  def is_dangerous(text, sentiment):
200
  if pattern.search(text or ""): return True
 
266
 
267
  # ---------------- SENTIMENT ----------------
268
  print("Loading sentiment model...")
269
+ # Initialize anchors (required for classification)
270
+ sentiment_analysis.init_anchors()
271
 
272
  texts = df["clean_text"].tolist()
273
  preds = []
274
+
275
+ for text in texts:
276
+ out = sentiment_analysis.classify(text)
277
+
278
+ # Handle error or valid result
279
+ if "error" in out:
280
+ preds.append(("Neutral", 0.0))
281
+ else:
282
+ label = out.get("label", "Neutral")
283
+ score = float(out.get("confidence", 0.0))
284
  preds.append((label, score))
285
 
286
  df["sentiment"] = [p[0] for p in preds]
287
  df["sentiment_score"] = [p[1] for p in preds]
288
+
289
  df["nature"] = [
290
  determine_nature(text, sentiment)
291
  for text, sentiment in zip(df["clean_text"], df["sentiment"])
292
  ]
 
293
 
294
  # ---------------- TOPIC MODELING ----------------
295
  print("Performing topic modeling...")
 
471
  csv_out = out_dir/"analysis_output.csv"
472
  df_out = df.copy()
473
  df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
474
+
475
+ import time
476
+ for attempt in range(3):
477
+ try:
478
+ df_out.to_csv(csv_out, index=False, encoding="utf-8")
479
+ print("✅ Enriched CSV saved as:", csv_out)
480
+ break
481
+ except PermissionError:
482
+ if attempt < 2:
483
+ print(f"⚠️ Permission denied saving CSV (file locked?). Retrying {attempt+1}/3 in 1s...")
484
+ time.sleep(1)
485
+ else:
486
+ print("❌ FAILED to save CSV. The file is likely open in another program (Excel/VS Code).")
487
+ # We don't raise here to allow PDF generation/return to complete,
488
+ # but the CSV won't be updated.
489
+
490
 
491
 
492
  # ---------------- DOCX EXPORT (optional) ----------------
reddit_scrapper.py CHANGED
@@ -17,46 +17,17 @@ logger.setLevel(logging.INFO)
17
  load_dotenv()
18
 
19
  # default queries (copied from your Selenium version)
20
- political_queries: List[str] = [
21
- "india politics",
22
- "india protest",
23
- "india government fail",
24
- "india corruption",
25
- "india democracy threat",
26
- "india dictatorship",
27
- "india religious violence",
28
- "india communal riots",
29
- "india anti muslim",
30
- "india anti sikh",
31
- "india caste violence",
32
- "india hate speech",
33
- "india freedom struggle",
34
- "india human rights violation",
35
- "india farmers protest",
36
- "india caa protest",
37
- "india nrc protest",
38
- "india modi resign",
39
- "india bjp fail",
40
- "india rss agenda",
41
- "india fake news",
42
- "india propaganda",
43
- "india media blackout",
44
- "boycott india",
45
- "boycott indian products",
46
- "boycott bollywood",
47
- "kashmir freedom",
48
- "kashmir human rights",
49
- "kashmir india occupation",
50
- "kashmir protest",
51
- "khalistan movement",
52
- "punjab separatism",
53
- "anti national india",
54
- "down with india",
55
- "stop india aggression",
56
- "india pakistan conflict",
57
- "china india border",
58
- "india brutality",
59
- "india minority oppression"
60
  ]
61
 
62
  def _init_reddit():
 
17
  load_dotenv()
18
 
19
  # default queries (copied from your Selenium version)
20
+ political_queries: List[str] = ["india politics","india protest","india government fail","india corruption",
21
+ "india democracy threat","india dictatorship","india religious violence",
22
+ "india communal riots","india anti muslim","india anti sikh","india caste violence",
23
+ "india hate speech","india freedom struggle","india human rights violation",
24
+ "india farmers protest","india caa protest","india nrc protest","india modi resign",
25
+ "india bjp fail","india rss agenda","india fake news","india propaganda",
26
+ "india media blackout","boycott india","boycott indian products","boycott bollywood",
27
+ "kashmir freedom","kashmir human rights","kashmir india occupation","kashmir protest",
28
+ "khalistan movement","punjab separatism","anti national india","down with india",
29
+ "stop india aggression","india pakistan conflict","china india border",
30
+ "india brutality","india minority oppression"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ]
32
 
33
  def _init_reddit():
regenerate_data.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""One-off utility: re-run report generation on the most recent scraped CSV."""
import processor
from pathlib import Path

# NOTE(review): hard-coded absolute path — assumes the project checkout lives
# at d:\CIIS\server; make this configurable if the tree ever moves.
base_dir = Path(r"d:\CIIS\server")
input_csv = base_dir / "storage" / "latest" / "scraped_input.csv"
output_dir = base_dir / "storage" / "latest"

if input_csv.exists():
    print(f"Regenerating report from {input_csv}...")
    processor.generate_reports_from_csv(str(input_csv), str(output_dir))
    print("Regeneration complete.")
else:
    print(f"Input file not found: {input_csv}")
requirements.txt CHANGED
@@ -20,114 +20,12 @@ tokenizers
20
 
21
  tqdm
22
 
 
 
 
 
 
23
 
24
-
25
-
26
-
27
-
28
-
29
-
30
-
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
- # absl-py==2.3.1
39
- # annotated-types==0.7.0
40
- # anyio==4.10.0
41
- # astunparse==1.6.3
42
- # attrs==25.3.0
43
- # certifi==2025.8.3
44
- # cffi==1.17.1
45
- # charset-normalizer==3.4.3
46
- # click==8.2.1
47
- # colorama==0.4.6
48
- # contourpy==1.3.3
49
- # cycler==0.12.1
50
- # fastapi==0.116.1
51
- # filelock==3.19.1
52
- # flatbuffers==25.2.10
53
- # fonttools==4.59.2
54
- # fsspec==2025.7.0
55
- # gast==0.6.0
56
- # google-pasta==0.2.0
57
- # grpcio==1.74.0
58
- # h11==0.16.0
59
- # h5py==3.14.0
60
- # huggingface-hub==0.34.4
61
- # idna==3.10
62
- # Jinja2==3.1.4
63
- # joblib==1.5.2
64
- # kiwisolver==1.4.9
65
- # libclang==18.1.1
66
- # lxml==6.0.1
67
- # Markdown==3.8.2
68
- # markdown-it-py==4.0.0
69
- # matplotlib==3.10.8
70
- # mdurl==0.1.2
71
- # ml_dtypes==0.5.3
72
- # mpmath==1.3.0
73
- # namex==0.1.0
74
- # networkx==3.3
75
- # numpy==2.3.2
76
- # opt_einsum==3.4.0
77
- # optree==0.17.0
78
- # outcome==1.3.0.post0
79
- # packaging==25.0
80
- # pandas==2.3.2
81
- # pillow==12.1.0
82
- # praw==7.8.1
83
- # prawcore==2.4.0
84
- # protobuf==6.32.0
85
- # pycparser==2.22
86
- # pydantic==2.11.7
87
- # pydantic_core==2.33.2
88
- # Pygments==2.19.2
89
- # pyparsing==3.2.3
90
- # PySocks==1.7.1
91
- # python-dateutil==2.9.0.post0
92
- # python-docx==1.2.0
93
- # python-dotenv==1.2.1
94
- # pytz==2025.2
95
- # PyYAML==6.0.2
96
- # regex==2025.8.29
97
- # reportlab==4.4.3
98
- # requests==2.32.5
99
- # rich==14.1.0
100
- # safetensors==0.6.2
101
- # scikit-learn==1.7.1
102
- # scipy==1.16.1
103
- # selenium==4.35.0
104
- # setuptools==80.9.0
105
- # six==1.17.0
106
- # sniffio==1.3.1
107
- # sortedcontainers==2.4.0
108
- # starlette==0.47.3
109
- # sympy==1.13.3
110
- # tensorboard==2.20.0
111
- # tensorboard-data-server==0.7.2
112
- # termcolor==3.1.0
113
- # threadpoolctl==3.6.0
114
- # tokenizers==0.22.0
115
- # torch==2.8.0+cpu
116
- # torchaudio==2.8.0+cpu
117
- # torchvision==0.23.0+cpu
118
- # tqdm==4.67.1
119
- # transformers==4.56.0
120
- # trio==0.30.0
121
- # trio-websocket==0.12.2
122
- # typing-inspection==0.4.1
123
- # typing_extensions==4.15.0
124
- # tzdata==2025.2
125
- # update-checker==0.18.0
126
- # urllib3==2.5.0
127
- # uvicorn==0.35.0
128
- # websocket-client==1.8.0
129
- # Werkzeug==3.1.3
130
- # wheel==0.45.1
131
- # wordcloud==1.9.4
132
- # wrapt==1.17.3
133
- # wsproto==1.2.0
 
20
 
21
  tqdm
22
 
23
+ # Core ML & NLP
24
+ # torch>=2.0.0
25
+ transformers
26
+ sentence-transformers
27
+ joblib
28
 
29
+ # Language Detection & Translation
30
+ langdetect
31
+ deep-translator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sentiment_analysis.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # ---- PERMANENT IMPORT FIX ----
5
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ sys.path.insert(0, ROOT_DIR)
7
+
8
+ from src.language_detection import detect_language
9
+ from src.preprocessing import clean_text
10
+ from src.predict import predict
11
+ from src.feature_builder import build_features
12
+ from src.anchor_similarity import compute_similarity
13
+ from src.embeddings import embedder
14
+ from src.sarcasm import sarcasm_score
15
+ from src.sentiment import sentiment_scores
16
+ from src.translation import translate_to_english
17
+ from src.context_llm import get_context_probs
18
+
19
+ # ---- SUPPORTED LANGUAGES ----
20
+ SUPPORTED_LANGS = {"en", "hi", "ta", "ur", "bn", "te", "ml", "gu", "kn", "mr"}
21
+
22
+ LABELS = [
23
+ "Pro-India",
24
+ "Anti-India",
25
+ "Pro-Government",
26
+ "Anti-Government",
27
+ "Neutral"
28
+ ]
29
+
30
def init_anchors():
    """Load anchor sentences from data/anchors/, embed them, and register the
    embeddings with the anchor_similarity module.

    Missing or empty anchor files are skipped with a warning rather than
    raising, so classification can still run on the remaining categories.
    """
    print("[INIT] Loading anchor embeddings...")
    anchor_dir = os.path.join(ROOT_DIR, "data", "anchors")

    categories = ("pro_india", "anti_india", "pro_government", "anti_government", "neutral")
    anchors = {}

    for category in categories:
        path = os.path.join(anchor_dir, f"{category}.txt")
        if not os.path.exists(path):
            print(f"[WARNING] Anchor file missing: {path}")
            continue

        with open(path, "r", encoding="utf-8") as fh:
            sentences = [ln.strip() for ln in fh if ln.strip()]

        if not sentences:
            print(f"[WARNING] Anchor file empty: {category}")
            continue

        # Batch-encode every sentence of this category in one call.
        anchors[category] = embedder.encode(sentences)
        print(f" - Loaded {category}: {len(sentences)} examples")

    # Hand the embeddings over to the similarity module's global store.
    from src.anchor_similarity import load_anchor_embeddings
    load_anchor_embeddings(anchors)
    print("[INIT] Anchor embeddings initialized.\n")
64
+
65
def classify(text: str):
    """Classify one Reddit post's political stance.

    Pipeline: clean -> language detect -> (translate to English if needed)
    -> sentence embedding -> anchor cosine similarities -> sentiment +
    sarcasm -> zero-shot context probabilities -> feature vector -> final
    classifier.

    Returns a result dict (text/label/confidence/language/sarcasm/sentiment)
    or {"error": ...} for empty input.
    """
    # 1. Clean text
    text = clean_text(text)

    if len(text.strip()) == 0:
        return {"error": "Empty input text"}

    # 2. Language detection (author-flagged debug print removed;
    # the confidence value was only used by that print).
    lang, _ = detect_language(text)

    # 2.5 Translation (if not English). All downstream processing uses
    # English because the sarcasm/sentiment models and the anchors are
    # English-specific.
    # NOTE(review): lang may be "unknown" here, in which case
    # translate_to_english is called with source="unknown" — confirm the
    # translator tolerates that code.
    processing_text = text
    if lang != 'en':
        print(f"[INFO] Translating {lang} to en...")
        translated = translate_to_english(text, source=lang)
        print(f" -> {translated}")
        processing_text = translated

    # 3. Sentence embedding (normalized, for cosine similarity)
    text_embedding = embedder.encode(processing_text, normalize_embeddings=True)

    # 4. Cosine similarity with anchors (uses globals set by init_anchors)
    similarity_scores = compute_similarity(
        text_embedding=text_embedding,
        anchor_embeddings=None  # handled internally if global
    )

    # 5. Sentiment + sarcasm
    sentiment = sentiment_scores(processing_text)  # [neg, neutral, pos]
    sarcasm = sarcasm_score(processing_text)       # float 0-1

    # 5.5 LLM zero-shot context analysis
    context_probs = get_context_probs(processing_text)

    # 6. Feature vector
    features = build_features(
        similarity=similarity_scores,
        sentiment=sentiment,
        sarcasm=sarcasm,
        context_probs=context_probs
    )

    # 7. Final prediction
    label_idx, confidence = predict(features)

    return {
        "text": text,
        "label": LABELS[label_idx],
        "confidence": round(confidence, 3),
        "language": lang,
        "sarcasm_score": round(sarcasm, 3),
        "sentiment": {
            "negative": round(sentiment[0], 3),
            "neutral": round(sentiment[1], 3),
            "positive": round(sentiment[2], 3),
        }
    }
128
+
129
# ---- ENTRY POINT ----
if __name__ == "__main__":
    init_anchors()

    # Batch mode: classify each non-blank line of test.txt when present.
    if os.path.exists("test.txt"):
        print("Processing test.txt...")
        # Fix: read with an explicit UTF-8 encoding (consistent with the
        # anchor-file reads) instead of the platform default, which breaks
        # on non-ASCII posts under Windows code pages.
        with open("test.txt", "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    result = classify(line)
                    print(result)
                    print("-" * 50)

    # Interactive REPL mode.
    print("\n🔍 Reddit Political Stance Classifier")
    print("Type 'exit' to quit\n")

    while True:
        text = input("Enter Reddit post: ").strip()

        if text.lower() == "exit":
            break

        result = classify(text)
        print("\nResult:")
        print(result)
        print("-" * 50)
src/__init__.py ADDED
File without changes
src/anchor_similarity.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np

print("anchor_similarity module loaded")

# --------------------------------------------------
# GLOBAL ANCHOR EMBEDDINGS
# --------------------------------------------------
# Filled once at startup via load_anchor_embeddings(); maps category name
# ("pro_india", "anti_india", "pro_government", "anti_government",
# "neutral") to a 2-D np.ndarray of anchor-sentence embeddings.
ANCHOR_EMBEDDINGS = {}


def load_anchor_embeddings(anchor_embeddings: dict):
    """Register precomputed anchor embeddings (called once at startup)."""
    global ANCHOR_EMBEDDINGS
    ANCHOR_EMBEDDINGS = anchor_embeddings


def _cosine_row(vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Cosine similarity between one vector and every row of *matrix*.

    Pure-NumPy replacement for sklearn.metrics.pairwise.cosine_similarity
    so this hot path does not need scikit-learn. Zero-norm rows are treated
    as norm 1 (sklearn's convention), yielding similarity 0.
    """
    vec = np.asarray(vec, dtype=np.float64).ravel()
    matrix = np.asarray(matrix, dtype=np.float64)
    v_norm = np.linalg.norm(vec) or 1.0
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0.0] = 1.0
    return (matrix @ vec) / (row_norms * v_norm)


def compute_similarity(text_embedding: np.ndarray, anchor_embeddings=None) -> dict:
    """Compute per-category anchor-similarity scores for one text embedding.

    Returns {category: mean of the top-5 cosine similarities}; categories
    with fewer than 5 anchors average all of them (np slicing handles the
    short case naturally).

    Raises ValueError if no anchors have been loaded.
    """
    # Use the global anchors unless a dict is explicitly passed.
    anchors = anchor_embeddings if anchor_embeddings is not None else ANCHOR_EMBEDDINGS

    if not anchors:
        raise ValueError("Anchor embeddings not loaded")

    scores = {}
    for label, vectors in anchors.items():
        sims = _cosine_row(text_embedding, vectors)
        # top-k (k=5) mean similarity
        scores[label] = float(np.mean(np.sort(sims)[-5:]))

    return scores
src/config.py ADDED
File without changes
src/context_llm.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import pipeline
import torch

print("context_llm module loaded (Zero-Shot BART)")

# Global pipeline, created lazily by load_context_model().
classifier = None


def load_context_model():
    """Lazily load the zero-shot classification pipeline.

    Uses valhalla/distilbart-mnli-12-3 — a distilled bart-large-mnli.
    (The previous docstring still named the full-size facebook model;
    corrected here.) Load failure is non-fatal: get_context_probs() then
    returns uniform probabilities.
    """
    global classifier
    if classifier is not None:
        return

    try:
        # GPU when available, otherwise CPU (safe default on Windows).
        device = 0 if torch.cuda.is_available() else -1

        print("[LLM] Loading valhalla/distilbart-mnli-12-3 (Distilled) for context analysis...")
        classifier = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-3",
            device=device
        )
        print("[LLM] Context model loaded successfully.")
    except Exception as e:
        print(f"[LLM] CRITICAL ERROR: {e}")
        # non-fatal: callers fall back to neutral scores
        pass


def get_context_probs(text: str) -> list:
    """Score *text* against four stance hypotheses via zero-shot NLI.

    Returns probabilities in fixed order:
        [0] "criticism of the government"  (Anti-Govt)
        [1] "criticism of the country"     (Anti-India)
        [2] "praise of the government"     (Pro-Govt)
        [3] "praise of the country"        (Pro-India)
    Falls back to uniform [0.25]*4 when the model is unavailable or
    inference fails.
    """
    # Lazy load on first use.
    if classifier is None:
        load_context_model()

    if classifier is None:
        # Model failed to load — neutral fallback.
        return [0.25, 0.25, 0.25, 0.25]

    labels = [
        "criticism of the government",  # 0
        "criticism of the country",     # 1
        "praise of the government",     # 2
        "praise of the country"         # 3
    ]

    try:
        result = classifier(text, candidate_labels=labels, multi_label=False)

        # Pipeline output is sorted by score descending; map scores back
        # to our fixed label order.
        score_map = {label: score for label, score in zip(result['labels'], result['scores'])}
        return [score_map.get(lbl, 0.0) for lbl in labels]

    except Exception as e:
        print(f"[LLM] Inference failed: {e}")
        return [0.25, 0.25, 0.25, 0.25]
src/embeddings.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Shared sentence-embedding model, loaded once and reused everywhere."""
from sentence_transformers import SentenceTransformer

print("embeddings module loaded")

# Multilingual sentence embedding model
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Module-level singleton: loaded once at import time, shared by all callers.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
src/feature_builder.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np

print("feature_builder module loaded")

def build_features(similarity: dict, sentiment: list, sarcasm: float, context_probs: list) -> np.ndarray:
    """Assemble the final 13-dim feature vector for stance classification.

    similarity:    dict of 5 anchor-similarity scores
    sentiment:     [neg, neutral, pos]
    sarcasm:       float
    context_probs: [pol_crit, nat_crit, pol_praise, nat_praise]
    """
    # Fixed anchor-category order — must match training-time layout.
    sim_keys = ("pro_india", "anti_india", "pro_government", "anti_government", "neutral")
    parts = [similarity[key] for key in sim_keys]

    # Sentiment triple: negative, neutral, positive.
    parts += [sentiment[0], sentiment[1], sentiment[2]]

    parts.append(sarcasm)

    # Zero-shot context: political criticism, national criticism,
    # political praise, national praise.
    parts += [context_probs[0], context_probs[1], context_probs[2], context_probs[3]]

    return np.array(parts, dtype=np.float32)
src/language_detection.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langdetect import detect_langs, DetectorFactory
2
+
3
+ # Enforce determinism
4
+ DetectorFactory.seed = 0
5
+
6
def detect_language(text: str):
    """Detect the language of *text*, returning (lang_code, probability).

    Strategy:
      1. Cheap heuristic: if any common English stopword appears,
         short-circuit to ("en", 1.0). Solves short-text misdetections
         like "India has deep flaws" being flagged as Spanish.
      2. Otherwise fall back to langdetect's statistical detector.
      3. ("unknown", 0.0) when detection fails (empty/numeric text).
    """
    # 1. Heuristic: common English stopwords.
    english_stopwords = {"the", "is", "are", "and", "of", "to", "in", "it",
                         "has", "have", "for", "on", "with"}
    # Fix: strip surrounding punctuation so tokens like "is." or "it,"
    # still hit the stopword set — a raw split() missed them.
    words = {w.strip(".,!?;:'\"()[]") for w in text.lower().split()}

    # Non-empty intersection => high confidence it's English.
    if words & english_stopwords:
        return "en", 1.0

    # 2. Statistical detection (langdetect) — returns a list of
    # [Language(lang, prob), ...] sorted by probability.
    try:
        best = detect_langs(text)[0]
        return best.lang, best.prob
    except Exception:
        # 3. Fallback for empty/numeric text.
        return "unknown", 0.0
29
+
src/predict.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import numpy as np
3
+
4
+ print("predict module loaded")
5
+
6
+ MODEL_PATH = "models/final_classifier.pkl"
7
+
8
+ clf = joblib.load(MODEL_PATH)
9
+
10
def predict(features: np.ndarray):
    """Run the final stance classifier on one feature vector.

    Returns (label_index, confidence) where confidence is the relative
    margin between the best and runner-up class probabilities,
    (p1 - p2) / p1: 0.0 when the top two classes tie, approaching 1.0 as
    the winner dominates.
    """
    probs = clf.predict_proba([features])[0]

    # Indices of the two highest-probability classes.
    ranked = np.argsort(probs)[::-1]
    best, runner_up = ranked[0], ranked[1]

    margin = (probs[best] - probs[runner_up]) / probs[best]
    return best, float(margin)
src/preprocessing.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Basic text normalization helpers for scraped Reddit posts."""
print("preprocessing module loaded")

import re


def clean_text(text: str) -> str:
    """Basic text normalization for Reddit posts.

    Removes URLs, collapses whitespace runs to single spaces, and trims
    the ends.
    """
    # Fix: the module previously defined clean_text (and `import re`)
    # twice, with the second definition silently shadowing the first;
    # the duplicates are merged into this single definition.
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"\s+", " ", text)     # normalize spaces
    return text.strip()
src/sarcasm.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("sarcasm module loaded (BERT Sarcasm Detector)")

# FIX: Use a Twitter-based Irony model (RoBERTa) which is better for social media/Reddit
MODEL_NAME = "cardiffnlp/twitter-roberta-base-irony"

# Model and tokenizer are loaded once at import time (downloads on first use).
try:
    # FIX: Force use_fast=False to avoid Windows rust-tokenizer crashes
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.eval()  # inference only: disables dropout / training-mode layers
except Exception as e:
    # Fail loudly: downstream scoring is meaningless without this model.
    print(f"CRITICAL ERROR loading sarcasm model: {e}")
    raise e
17
+
18
+
19
def sarcasm_score(text: str) -> float:
    """
    Deep sarcasm/irony probability in [0, 1] for a single text.

    Uses the module-level MODEL_NAME model (cardiffnlp irony RoBERTa),
    loaded at import time above. (Earlier comments referenced
    helinivan/english-sarcasm-detector; that is no longer the model used.)
    """

    with torch.no_grad():  # inference only — no gradient bookkeeping
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128  # Reddit comments are short; cap long outliers
        )

        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)

        # Irony model label order: index 0 = not ironic, index 1 = ironic
        # (per the cardiffnlp model card — confirm if MODEL_NAME changes).
        # Return the probability of the sarcastic/ironic class.
        return float(probs[0][1])
src/sentiment.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("sentiment module loaded (English RoBERTa)")

# FIX: Use the standard (older) model which definitely has support for slow tokenizers
# The 'latest' version sometimes lacks full file support for use_fast=False on all setups
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

# Model and tokenizer are loaded once at import time (downloads on first use).
try:
    # FIX: Force use_fast=False to avoid Windows rust-tokenizer crashes
    # This uses the stable Python-based tokenizer (Byte-Level BPE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.eval()  # inference only: disables dropout / training-mode layers
except Exception as e:
    # Fail loudly: downstream scoring is meaningless without this model.
    print(f"CRITICAL ERROR loading sentiment model: {e}")
    raise e
19
+
20
+
21
def sentiment_scores(text: str):
    """
    Score `text` with the module-level RoBERTa sentiment model.

    Returns:
        list[float]: class probabilities ordered [negative, neutral, positive].
    """
    # Tokenization needs no gradient context; only the forward pass does.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=1)[0]
    # Model output order: negative, neutral, positive
    return probabilities.tolist()
src/train_classifier.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import joblib
from sklearn.linear_model import LogisticRegression

print("train_classifier module loaded")


def train_and_save(X, y, model_path: str = "models/final_classifier.pkl"):
    """
    Train the final stance classifier and persist it to disk.

    Args:
        X: 2-D array-like of feature vectors.
        y: 1-D array-like of integer class labels.
        model_path: Where to write the pickled model. Defaults to the
            path the rest of the pipeline expects.

    Returns:
        The fitted LogisticRegression instance.
    """
    # NOTE: `multi_class="multinomial"` is deprecated (removed in
    # scikit-learn 1.7) and is already the behavior for the lbfgs solver
    # on multiclass targets, so it is intentionally omitted.
    clf = LogisticRegression(max_iter=2000)
    clf.fit(X, y)

    # Ensure the output directory exists before dumping.
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(clf, model_path)
    print("✅ Model trained and saved")
    return clf
src/train_logic_aligned.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import joblib
import os
from sklearn.linear_model import LogisticRegression

# Output path
# Resolved relative to this file so the script works from any working directory.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "..", "models", "final_classifier.pkl")
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

print(">>> Generating Synthetic Logic-Aligned Training Data...")

# Feature layout of the 13-dim vector produced by generate_sample below:
# 0: sim_pro_india
# 1: sim_anti_india
# 2: sim_pro_govt
# 3: sim_anti_govt
# 4: sim_neutral
# 5: neg
# 6: neu
# 7: pos
# 8: sarcasm
# 9: context_pol_crit (Anti-Govt)
# 10: context_nat_crit (Anti-India)
# 11: context_pol_praise (Pro-Govt)
# 12: context_nat_praise (Pro-India)
27
def generate_sample(label_idx):
    """
    Build one synthetic 13-dim feature vector for a stance class.

    Classes: 0=Pro-India, 1=Anti-India, 2=Pro-Government,
    3=Anti-Government, 4=Neutral. Sarcasm (feature 8) is kept low for
    every class so the classifier learns to ignore it.

    Args:
        label_idx: Integer class index in [0, 4].

    Returns:
        numpy.ndarray of shape (13,).
    """
    # Start every feature from low uniform noise.
    feats = np.random.uniform(0.0, 0.3, 13)

    # Per-class overrides as (feature_index, low, high).
    # The tuples are applied in the listed order so the random-draw
    # sequence matches the previous hand-written implementation exactly.
    overrides = {
        # Pro-India: high pro-India sim, positive sentiment, national praise.
        0: [(0, 0.6, 1.0), (7, 0.5, 1.0), (5, 0.0, 0.2), (8, 0.0, 0.2),
            (12, 0.7, 1.0), (9, 0.0, 0.2)],
        # Anti-India: high anti-India sim, negative sentiment, national criticism.
        1: [(1, 0.6, 1.0), (5, 0.5, 1.0), (7, 0.0, 0.2), (8, 0.0, 0.2),
            (10, 0.7, 1.0), (9, 0.0, 0.3)],
        # Pro-Government: high pro-govt sim, positive sentiment, political praise.
        2: [(2, 0.6, 1.0), (7, 0.5, 1.0), (5, 0.0, 0.2), (8, 0.0, 0.2),
            (11, 0.7, 1.0), (10, 0.0, 0.2)],
        # Anti-Government: high anti-govt sim, negative sentiment, political criticism.
        3: [(3, 0.6, 1.0), (5, 0.5, 1.0), (7, 0.0, 0.2), (8, 0.0, 0.2),
            (9, 0.7, 1.0), (10, 0.0, 0.4)],
        # Neutral: high neutral sim/sentiment, everything else muted.
        4: [(4, 0.5, 1.0), (6, 0.5, 1.0), (5, 0.0, 0.2), (7, 0.0, 0.2),
            (8, 0.0, 0.1), (9, 0.0, 0.3), (10, 0.0, 0.3)],
    }

    for idx, lo, hi in overrides.get(label_idx, []):
        feats[idx] = np.random.uniform(lo, hi)

    return feats
84
# Generate a balanced synthetic training set: SAMPLES_PER_CLASS vectors
# per stance class, labels 0..4. Iteration order (class outer, sample
# inner) matches the original append-loop, so the RNG draw sequence —
# and therefore the trained model — is unchanged.
SAMPLES_PER_CLASS = 500

X = np.array([generate_sample(label)
              for label in range(5)
              for _ in range(SAMPLES_PER_CLASS)])
y = np.array([label
              for label in range(5)
              for _ in range(SAMPLES_PER_CLASS)])

print(f"Training Logistic Regression on {len(X)} synthetic samples (13 features)...")
# NOTE: `multi_class='multinomial'` is deprecated (removed in scikit-learn
# 1.7) and is already the lbfgs behavior on multiclass targets, so the
# keyword is intentionally omitted.
clf = LogisticRegression(max_iter=1000, solver='lbfgs')
clf.fit(X, y)

print(f"Accuracy on Training Set: {clf.score(X, y):.4f}")

print(f"Saving model to {MODEL_PATH}...")
joblib.dump(clf, MODEL_PATH)
print("DONE.")
src/translation.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from deep_translator import GoogleTranslator
2
+ import time
3
+
4
def translate_to_english(text: str, source="auto") -> str:
    """
    Translate input text to English using Google Translator.

    Retries once (after a short pause) on failure, as the original
    docstring promised but the code never did; falls back to the
    untouched input if both attempts fail.

    Args:
        text: Text to translate.
        source: Source language code, or "auto" for detection.

    Returns:
        The English translation, or `text` unchanged on failure.
    """
    for attempt in range(2):
        try:
            translator = GoogleTranslator(source=source, target='en')
            return translator.translate(text)
        except Exception as e:
            print(f"[WARNING] Translation failed (attempt {attempt + 1}): {e}")
            if attempt == 0:
                time.sleep(1)  # brief back-off before the single retry
    # Fallback to original text if translation fails
    return text
train_once.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from src.train_classifier import train_and_save

# Smoke-test script: fits the classifier once on random data so that
# models/final_classifier.pkl exists for the prediction pipeline.

# DUMMY FEATURES (9 features as defined)
# NOTE(review): src/train_logic_aligned.py builds 13-dim feature vectors;
# a model fitted on 9 features here will reject 13-dim inputs at predict
# time — confirm which feature width the live pipeline expects.
X = np.random.rand(20, 9)

# DUMMY LABELS (5 classes)
y = np.random.randint(0, 5, size=20)

train_and_save(X, y)