Commit
·
bbd259b
1
Parent(s):
db15eda
update
Browse files- data/anchors/anti_government.txt +20 -0
- data/anchors/anti_india.txt +20 -0
- data/anchors/neutral.txt +5 -0
- data/anchors/pro_government.txt +20 -0
- data/anchors/pro_india.txt +20 -0
- main.py +1 -1
- models/final_classifier.pkl +3 -0
- processor.py +75 -34
- reddit_scrapper.py +11 -40
- regenerate_data.py +15 -0
- requirements.txt +8 -110
- sentiment_analysis.py +155 -0
- src/__init__.py +0 -0
- src/anchor_similarity.py +51 -0
- src/config.py +0 -0
- src/context_llm.py +79 -0
- src/embeddings.py +8 -0
- src/feature_builder.py +31 -0
- src/language_detection.py +29 -0
- src/predict.py +22 -0
- src/preprocessing.py +17 -0
- src/sarcasm.py +41 -0
- src/sentiment.py +37 -0
- src/train_classifier.py +17 -0
- src/train_logic_aligned.py +106 -0
- src/translation.py +17 -0
- train_once.py +10 -0
data/anchors/anti_government.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This government has completely failed the people
|
| 2 |
+
The current administration is incompetent
|
| 3 |
+
The ruling party has destroyed democratic institutions
|
| 4 |
+
Government policies are harming ordinary citizens
|
| 5 |
+
Leadership has no vision or accountability
|
| 6 |
+
This regime survives only on propaganda
|
| 7 |
+
Government mismanagement has worsened the economy
|
| 8 |
+
The administration suppresses dissent
|
| 9 |
+
Current leadership prioritizes power over people
|
| 10 |
+
Government decisions lack transparency
|
| 11 |
+
This government is authoritarian in nature
|
| 12 |
+
The ruling party exploits nationalism
|
| 13 |
+
Government failures are being hidden
|
| 14 |
+
Leadership has betrayed public trust
|
| 15 |
+
This administration governs through fear
|
| 16 |
+
The government ignores expert advice
|
| 17 |
+
Policies are short-sighted and harmful
|
| 18 |
+
Government accountability is nonexistent
|
| 19 |
+
The regime is out of touch with reality
|
| 20 |
+
This government has weakened institutions
|
data/anchors/anti_india.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
India is a failed state pretending to be a democracy
|
| 2 |
+
The idea of India itself is deeply flawed
|
| 3 |
+
India has never been a real nation, only forced unity
|
| 4 |
+
Indian nationalism is dangerous and regressive
|
| 5 |
+
India is responsible for most of its regional instability
|
| 6 |
+
Indian society is inherently intolerant
|
| 7 |
+
India’s global image is built on lies
|
| 8 |
+
India does not deserve its geopolitical influence
|
| 9 |
+
The Indian state has systematically oppressed minorities
|
| 10 |
+
India’s rise is bad for global peace
|
| 11 |
+
Indian culture promotes backward thinking
|
| 12 |
+
India should not be trusted internationally
|
| 13 |
+
The concept of Indian unity is artificial
|
| 14 |
+
India has failed morally and socially
|
| 15 |
+
India is an embarrassment on the world stage
|
| 16 |
+
Indian nationalism harms humanity
|
| 17 |
+
India’s historical narrative is propaganda
|
| 18 |
+
India has no moral authority globally
|
| 19 |
+
India as a country is fundamentally broken
|
| 20 |
+
The world would be better without India’s influence
|
data/anchors/neutral.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This is a news update about the event.
|
| 2 |
+
Just stating the facts of the situation.
|
| 3 |
+
Let's verify the information before deciding.
|
| 4 |
+
I am impartial on this topic.
|
| 5 |
+
This is a complex issue with multiple sides.
|
data/anchors/pro_government.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The government has taken bold decisions
|
| 2 |
+
Current leadership shows strong governance
|
| 3 |
+
Government policies are improving infrastructure
|
| 4 |
+
The administration has delivered results
|
| 5 |
+
Leadership has strengthened national security
|
| 6 |
+
Government reforms are necessary and effective
|
| 7 |
+
The ruling party has a clear vision
|
| 8 |
+
This government has improved efficiency
|
| 9 |
+
Policy execution has been strong
|
| 10 |
+
Leadership is decisive and focused
|
| 11 |
+
The administration prioritizes development
|
| 12 |
+
Government initiatives are benefiting citizens
|
| 13 |
+
This regime has improved governance standards
|
| 14 |
+
Leadership has global credibility
|
| 15 |
+
Government action has been timely
|
| 16 |
+
Policies show long-term thinking
|
| 17 |
+
Administration has improved accountability
|
| 18 |
+
The government has strengthened institutions
|
| 19 |
+
Leadership has earned public support
|
| 20 |
+
This government is results-oriented
|
data/anchors/pro_india.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
India is a resilient and diverse nation
|
| 2 |
+
The unity of India is its greatest strength
|
| 3 |
+
India’s cultural heritage is unparalleled
|
| 4 |
+
Indian society has endured immense challenges
|
| 5 |
+
India plays a vital role in global stability
|
| 6 |
+
India’s democratic spirit is admirable
|
| 7 |
+
The idea of India represents pluralism
|
| 8 |
+
India has shown remarkable growth
|
| 9 |
+
Indian civilization has deep philosophical roots
|
| 10 |
+
India’s diversity is its power
|
| 11 |
+
The Indian nation has survived against odds
|
| 12 |
+
India contributes positively to the world
|
| 13 |
+
India’s history is rich and complex
|
| 14 |
+
Indian values emphasize coexistence
|
| 15 |
+
India’s global influence is deserved
|
| 16 |
+
The Indian people are resilient
|
| 17 |
+
India stands for sovereignty and unity
|
| 18 |
+
India’s cultural legacy matters globally
|
| 19 |
+
The nation of India continues to evolve
|
| 20 |
+
India represents hope for plural societies
|
main.py
CHANGED
|
@@ -30,7 +30,7 @@ class RerunRequest(BaseModel):
|
|
| 30 |
intent: Literal["light", "medium", "deep"]
|
| 31 |
|
| 32 |
INTENT_LIMITS = {
|
| 33 |
-
"light": {"per_query": 20, "total":
|
| 34 |
"medium": {"per_query": 50, "total": 300},
|
| 35 |
"deep": {"per_query": 100, "total": 800},
|
| 36 |
}
|
|
|
|
| 30 |
intent: Literal["light", "medium", "deep"]
|
| 31 |
|
| 32 |
INTENT_LIMITS = {
|
| 33 |
+
"light": {"per_query": 20, "total": 20},
|
| 34 |
"medium": {"per_query": 50, "total": 300},
|
| 35 |
"deep": {"per_query": 100, "total": 800},
|
| 36 |
}
|
models/final_classifier.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be4470e4cb9bcf6259d411d9a7f067343a35a2e7439b3c99b17891ea73c771cc
|
| 3 |
+
size 1455
|
processor.py
CHANGED
|
@@ -33,6 +33,11 @@ try:
|
|
| 33 |
except Exception:
|
| 34 |
DOCX_AVAILABLE = False
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
logger = logging.getLogger("processor")
|
| 37 |
logger.setLevel(logging.INFO)
|
| 38 |
|
|
@@ -56,16 +61,16 @@ try:
|
|
| 56 |
except Exception:
|
| 57 |
device = -1
|
| 58 |
|
| 59 |
-
try:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
except Exception as e:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
|
| 70 |
|
| 71 |
def parse_relative_time(s: str, ref: pd.Timestamp):
|
|
@@ -157,22 +162,39 @@ def text_matches_any(text, patterns):
|
|
| 157 |
|
| 158 |
def determine_nature(text, sentiment_label):
|
| 159 |
t = (text or "").lower()
|
| 160 |
-
|
| 161 |
-
if text_matches_any(t,
|
| 162 |
-
if text_matches_any(t,
|
| 163 |
-
if text_matches_any(t,
|
| 164 |
-
if text_matches_any(t,
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
if "
|
| 171 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
# ---------------- DANGEROUS FLAG ----------------
|
| 174 |
-
danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant",
|
| 175 |
-
|
|
|
|
|
|
|
| 176 |
|
| 177 |
def is_dangerous(text, sentiment):
|
| 178 |
if pattern.search(text or ""): return True
|
|
@@ -244,25 +266,30 @@ def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
|
|
| 244 |
|
| 245 |
# ---------------- SENTIMENT ----------------
|
| 246 |
print("Loading sentiment model...")
|
|
|
|
|
|
|
| 247 |
|
| 248 |
texts = df["clean_text"].tolist()
|
| 249 |
preds = []
|
| 250 |
-
|
| 251 |
-
for
|
| 252 |
-
out =
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
preds.append((label, score))
|
| 257 |
|
| 258 |
df["sentiment"] = [p[0] for p in preds]
|
| 259 |
df["sentiment_score"] = [p[1] for p in preds]
|
| 260 |
-
|
| 261 |
df["nature"] = [
|
| 262 |
determine_nature(text, sentiment)
|
| 263 |
for text, sentiment in zip(df["clean_text"], df["sentiment"])
|
| 264 |
]
|
| 265 |
-
|
| 266 |
|
| 267 |
# ---------------- TOPIC MODELING ----------------
|
| 268 |
print("Performing topic modeling...")
|
|
@@ -444,8 +471,22 @@ def generate_reports_from_csv(input_csv:str, out_dir:str) -> dict:
|
|
| 444 |
csv_out = out_dir/"analysis_output.csv"
|
| 445 |
df_out = df.copy()
|
| 446 |
df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
|
| 451 |
# ---------------- DOCX EXPORT (optional) ----------------
|
|
|
|
| 33 |
except Exception:
|
| 34 |
DOCX_AVAILABLE = False
|
| 35 |
|
| 36 |
+
try:
|
| 37 |
+
import sentiment_analysis
|
| 38 |
+
except Exception as e:
|
| 39 |
+
raise RuntimeError(f"Failed to import sentiment_analysis.py: {e}")
|
| 40 |
+
|
| 41 |
logger = logging.getLogger("processor")
|
| 42 |
logger.setLevel(logging.INFO)
|
| 43 |
|
|
|
|
| 61 |
except Exception:
|
| 62 |
device = -1
|
| 63 |
|
| 64 |
+
# try:
|
| 65 |
+
# sentiment_model = pipeline("sentiment-analysis",
|
| 66 |
+
# model="cardiffnlp/twitter-roberta-base-sentiment-latest",
|
| 67 |
+
# device=device)
|
| 68 |
+
# except Exception as e:
|
| 69 |
+
# print("Failed to load requested model:", e)
|
| 70 |
+
# try:
|
| 71 |
+
# sentiment_model = pipeline("sentiment-analysis", device=device)
|
| 72 |
+
# except Exception as ex:
|
| 73 |
+
# print("Final sentiment pipeline fallback failed:", ex); sys.exit(1)
|
| 74 |
|
| 75 |
|
| 76 |
def parse_relative_time(s: str, ref: pd.Timestamp):
|
|
|
|
| 162 |
|
| 163 |
def determine_nature(text, sentiment_label):
|
| 164 |
t = (text or "").lower()
|
| 165 |
+
# 1. High-priority flags (dangerous or specific categories)
|
| 166 |
+
if text_matches_any(t, SEPARATIST_RE): return "Separatist"
|
| 167 |
+
if text_matches_any(t, CALL_TO_ACTION_RE): return "Call-to-Action"
|
| 168 |
+
if text_matches_any(t, COMMUNAL_RE): return "Communal"
|
| 169 |
+
if text_matches_any(t, CONSPIRACY_RE): return "Conspiratorial"
|
| 170 |
+
|
| 171 |
+
# 2. Trust the advanced model's label if available
|
| 172 |
+
s = str(sentiment_label)
|
| 173 |
+
# The sentiment labels are Title-Cased (Pro-India, Anti-India, etc.)
|
| 174 |
+
# We return them as-is or ensure they match the nature output convention.
|
| 175 |
+
if s == "Pro-India": return "Pro-India"
|
| 176 |
+
if s == "Anti-India": return "Anti-India"
|
| 177 |
+
if s == "Pro-Government": return "Pro-Government"
|
| 178 |
+
if s == "Anti-Government": return "Anti-Government"
|
| 179 |
+
|
| 180 |
+
# 3. Fallback to Regex for other cases or if model was Neutral
|
| 181 |
+
if text_matches_any(t, ANTI_INDIA_RE): return "Anti-India"
|
| 182 |
+
if text_matches_any(t, PRO_INDIA_RE): return "Pro-India"
|
| 183 |
+
if text_matches_any(t, CRITICAL_GOVT_RE): return "Critical-of-Government"
|
| 184 |
+
if text_matches_any(t, SUPPORT_OPPOSITION_RE): return "Supportive-of-Opposition"
|
| 185 |
+
|
| 186 |
+
# 4. Fallback to generic POS/NEG (legacy)
|
| 187 |
+
s_upper = s.upper()
|
| 188 |
+
if "POS" in s_upper: return "Supportive"
|
| 189 |
+
if "NEG" in s_upper: return "Critical"
|
| 190 |
+
|
| 191 |
+
return "Neutral"
|
| 192 |
|
| 193 |
# ---------------- DANGEROUS FLAG ----------------
|
| 194 |
+
danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant",
|
| 195 |
+
"insurgency","boycott","protest","call to action"]
|
| 196 |
+
pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b',
|
| 197 |
+
flags=re.IGNORECASE)
|
| 198 |
|
| 199 |
def is_dangerous(text, sentiment):
|
| 200 |
if pattern.search(text or ""): return True
|
|
|
|
| 266 |
|
| 267 |
# ---------------- SENTIMENT ----------------
|
| 268 |
print("Loading sentiment model...")
|
| 269 |
+
# Initialize anchors (required for classification)
|
| 270 |
+
sentiment_analysis.init_anchors()
|
| 271 |
|
| 272 |
texts = df["clean_text"].tolist()
|
| 273 |
preds = []
|
| 274 |
+
|
| 275 |
+
for text in texts:
|
| 276 |
+
out = sentiment_analysis.classify(text)
|
| 277 |
+
|
| 278 |
+
# Handle error or valid result
|
| 279 |
+
if "error" in out:
|
| 280 |
+
preds.append(("Neutral", 0.0))
|
| 281 |
+
else:
|
| 282 |
+
label = out.get("label", "Neutral")
|
| 283 |
+
score = float(out.get("confidence", 0.0))
|
| 284 |
preds.append((label, score))
|
| 285 |
|
| 286 |
df["sentiment"] = [p[0] for p in preds]
|
| 287 |
df["sentiment_score"] = [p[1] for p in preds]
|
| 288 |
+
|
| 289 |
df["nature"] = [
|
| 290 |
determine_nature(text, sentiment)
|
| 291 |
for text, sentiment in zip(df["clean_text"], df["sentiment"])
|
| 292 |
]
|
|
|
|
| 293 |
|
| 294 |
# ---------------- TOPIC MODELING ----------------
|
| 295 |
print("Performing topic modeling...")
|
|
|
|
| 471 |
csv_out = out_dir/"analysis_output.csv"
|
| 472 |
df_out = df.copy()
|
| 473 |
df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")
|
| 474 |
+
|
| 475 |
+
import time
|
| 476 |
+
for attempt in range(3):
|
| 477 |
+
try:
|
| 478 |
+
df_out.to_csv(csv_out, index=False, encoding="utf-8")
|
| 479 |
+
print("✅ Enriched CSV saved as:", csv_out)
|
| 480 |
+
break
|
| 481 |
+
except PermissionError:
|
| 482 |
+
if attempt < 2:
|
| 483 |
+
print(f"⚠️ Permission denied saving CSV (file locked?). Retrying {attempt+1}/3 in 1s...")
|
| 484 |
+
time.sleep(1)
|
| 485 |
+
else:
|
| 486 |
+
print("❌ FAILED to save CSV. The file is likely open in another program (Excel/VS Code).")
|
| 487 |
+
# We don't raise here to allow PDF generation/return to complete,
|
| 488 |
+
# but the CSV won't be updated.
|
| 489 |
+
|
| 490 |
|
| 491 |
|
| 492 |
# ---------------- DOCX EXPORT (optional) ----------------
|
reddit_scrapper.py
CHANGED
|
@@ -17,46 +17,17 @@ logger.setLevel(logging.INFO)
|
|
| 17 |
load_dotenv()
|
| 18 |
|
| 19 |
# default queries (copied from your Selenium version)
|
| 20 |
-
political_queries: List[str] = [
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
"india caste violence",
|
| 32 |
-
"india hate speech",
|
| 33 |
-
"india freedom struggle",
|
| 34 |
-
"india human rights violation",
|
| 35 |
-
"india farmers protest",
|
| 36 |
-
"india caa protest",
|
| 37 |
-
"india nrc protest",
|
| 38 |
-
"india modi resign",
|
| 39 |
-
"india bjp fail",
|
| 40 |
-
"india rss agenda",
|
| 41 |
-
"india fake news",
|
| 42 |
-
"india propaganda",
|
| 43 |
-
"india media blackout",
|
| 44 |
-
"boycott india",
|
| 45 |
-
"boycott indian products",
|
| 46 |
-
"boycott bollywood",
|
| 47 |
-
"kashmir freedom",
|
| 48 |
-
"kashmir human rights",
|
| 49 |
-
"kashmir india occupation",
|
| 50 |
-
"kashmir protest",
|
| 51 |
-
"khalistan movement",
|
| 52 |
-
"punjab separatism",
|
| 53 |
-
"anti national india",
|
| 54 |
-
"down with india",
|
| 55 |
-
"stop india aggression",
|
| 56 |
-
"india pakistan conflict",
|
| 57 |
-
"china india border",
|
| 58 |
-
"india brutality",
|
| 59 |
-
"india minority oppression"
|
| 60 |
]
|
| 61 |
|
| 62 |
def _init_reddit():
|
|
|
|
| 17 |
load_dotenv()
|
| 18 |
|
| 19 |
# default queries (copied from your Selenium version)
|
| 20 |
+
political_queries: List[str] = ["india politics","india protest","india government fail","india corruption",
|
| 21 |
+
"india democracy threat","india dictatorship","india religious violence",
|
| 22 |
+
"india communal riots","india anti muslim","india anti sikh","india caste violence",
|
| 23 |
+
"india hate speech","india freedom struggle","india human rights violation",
|
| 24 |
+
"india farmers protest","india caa protest","india nrc protest","india modi resign",
|
| 25 |
+
"india bjp fail","india rss agenda","india fake news","india propaganda",
|
| 26 |
+
"india media blackout","boycott india","boycott indian products","boycott bollywood",
|
| 27 |
+
"kashmir freedom","kashmir human rights","kashmir india occupation","kashmir protest",
|
| 28 |
+
"khalistan movement","punjab separatism","anti national india","down with india",
|
| 29 |
+
"stop india aggression","india pakistan conflict","china india border",
|
| 30 |
+
"india brutality","india minority oppression"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
]
|
| 32 |
|
| 33 |
def _init_reddit():
|
regenerate_data.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import processor
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# Define paths
|
| 6 |
+
base_dir = Path(r"d:\CIIS\server")
|
| 7 |
+
input_csv = base_dir / "storage" / "latest" / "scraped_input.csv"
|
| 8 |
+
output_dir = base_dir / "storage" / "latest"
|
| 9 |
+
|
| 10 |
+
if input_csv.exists():
|
| 11 |
+
print(f"Regenerating report from {input_csv}...")
|
| 12 |
+
processor.generate_reports_from_csv(str(input_csv), str(output_dir))
|
| 13 |
+
print("Regeneration complete.")
|
| 14 |
+
else:
|
| 15 |
+
print(f"Input file not found: {input_csv}")
|
requirements.txt
CHANGED
|
@@ -20,114 +20,12 @@ tokenizers
|
|
| 20 |
|
| 21 |
tqdm
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# absl-py==2.3.1
|
| 39 |
-
# annotated-types==0.7.0
|
| 40 |
-
# anyio==4.10.0
|
| 41 |
-
# astunparse==1.6.3
|
| 42 |
-
# attrs==25.3.0
|
| 43 |
-
# certifi==2025.8.3
|
| 44 |
-
# cffi==1.17.1
|
| 45 |
-
# charset-normalizer==3.4.3
|
| 46 |
-
# click==8.2.1
|
| 47 |
-
# colorama==0.4.6
|
| 48 |
-
# contourpy==1.3.3
|
| 49 |
-
# cycler==0.12.1
|
| 50 |
-
# fastapi==0.116.1
|
| 51 |
-
# filelock==3.19.1
|
| 52 |
-
# flatbuffers==25.2.10
|
| 53 |
-
# fonttools==4.59.2
|
| 54 |
-
# fsspec==2025.7.0
|
| 55 |
-
# gast==0.6.0
|
| 56 |
-
# google-pasta==0.2.0
|
| 57 |
-
# grpcio==1.74.0
|
| 58 |
-
# h11==0.16.0
|
| 59 |
-
# h5py==3.14.0
|
| 60 |
-
# huggingface-hub==0.34.4
|
| 61 |
-
# idna==3.10
|
| 62 |
-
# Jinja2==3.1.4
|
| 63 |
-
# joblib==1.5.2
|
| 64 |
-
# kiwisolver==1.4.9
|
| 65 |
-
# libclang==18.1.1
|
| 66 |
-
# lxml==6.0.1
|
| 67 |
-
# Markdown==3.8.2
|
| 68 |
-
# markdown-it-py==4.0.0
|
| 69 |
-
# matplotlib==3.10.8
|
| 70 |
-
# mdurl==0.1.2
|
| 71 |
-
# ml_dtypes==0.5.3
|
| 72 |
-
# mpmath==1.3.0
|
| 73 |
-
# namex==0.1.0
|
| 74 |
-
# networkx==3.3
|
| 75 |
-
# numpy==2.3.2
|
| 76 |
-
# opt_einsum==3.4.0
|
| 77 |
-
# optree==0.17.0
|
| 78 |
-
# outcome==1.3.0.post0
|
| 79 |
-
# packaging==25.0
|
| 80 |
-
# pandas==2.3.2
|
| 81 |
-
# pillow==12.1.0
|
| 82 |
-
# praw==7.8.1
|
| 83 |
-
# prawcore==2.4.0
|
| 84 |
-
# protobuf==6.32.0
|
| 85 |
-
# pycparser==2.22
|
| 86 |
-
# pydantic==2.11.7
|
| 87 |
-
# pydantic_core==2.33.2
|
| 88 |
-
# Pygments==2.19.2
|
| 89 |
-
# pyparsing==3.2.3
|
| 90 |
-
# PySocks==1.7.1
|
| 91 |
-
# python-dateutil==2.9.0.post0
|
| 92 |
-
# python-docx==1.2.0
|
| 93 |
-
# python-dotenv==1.2.1
|
| 94 |
-
# pytz==2025.2
|
| 95 |
-
# PyYAML==6.0.2
|
| 96 |
-
# regex==2025.8.29
|
| 97 |
-
# reportlab==4.4.3
|
| 98 |
-
# requests==2.32.5
|
| 99 |
-
# rich==14.1.0
|
| 100 |
-
# safetensors==0.6.2
|
| 101 |
-
# scikit-learn==1.7.1
|
| 102 |
-
# scipy==1.16.1
|
| 103 |
-
# selenium==4.35.0
|
| 104 |
-
# setuptools==80.9.0
|
| 105 |
-
# six==1.17.0
|
| 106 |
-
# sniffio==1.3.1
|
| 107 |
-
# sortedcontainers==2.4.0
|
| 108 |
-
# starlette==0.47.3
|
| 109 |
-
# sympy==1.13.3
|
| 110 |
-
# tensorboard==2.20.0
|
| 111 |
-
# tensorboard-data-server==0.7.2
|
| 112 |
-
# termcolor==3.1.0
|
| 113 |
-
# threadpoolctl==3.6.0
|
| 114 |
-
# tokenizers==0.22.0
|
| 115 |
-
# torch==2.8.0+cpu
|
| 116 |
-
# torchaudio==2.8.0+cpu
|
| 117 |
-
# torchvision==0.23.0+cpu
|
| 118 |
-
# tqdm==4.67.1
|
| 119 |
-
# transformers==4.56.0
|
| 120 |
-
# trio==0.30.0
|
| 121 |
-
# trio-websocket==0.12.2
|
| 122 |
-
# typing-inspection==0.4.1
|
| 123 |
-
# typing_extensions==4.15.0
|
| 124 |
-
# tzdata==2025.2
|
| 125 |
-
# update-checker==0.18.0
|
| 126 |
-
# urllib3==2.5.0
|
| 127 |
-
# uvicorn==0.35.0
|
| 128 |
-
# websocket-client==1.8.0
|
| 129 |
-
# Werkzeug==3.1.3
|
| 130 |
-
# wheel==0.45.1
|
| 131 |
-
# wordcloud==1.9.4
|
| 132 |
-
# wrapt==1.17.3
|
| 133 |
-
# wsproto==1.2.0
|
|
|
|
| 20 |
|
| 21 |
tqdm
|
| 22 |
|
| 23 |
+
# Core ML & NLP
|
| 24 |
+
# torch>=2.0.0
|
| 25 |
+
transformers
|
| 26 |
+
sentence-transformers
|
| 27 |
+
joblib
|
| 28 |
|
| 29 |
+
# Language Detection & Translation
|
| 30 |
+
langdetect
|
| 31 |
+
deep-translator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sentiment_analysis.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# ---- PERMANENT IMPORT FIX ----
|
| 5 |
+
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
sys.path.insert(0, ROOT_DIR)
|
| 7 |
+
|
| 8 |
+
from src.language_detection import detect_language
|
| 9 |
+
from src.preprocessing import clean_text
|
| 10 |
+
from src.predict import predict
|
| 11 |
+
from src.feature_builder import build_features
|
| 12 |
+
from src.anchor_similarity import compute_similarity
|
| 13 |
+
from src.embeddings import embedder
|
| 14 |
+
from src.sarcasm import sarcasm_score
|
| 15 |
+
from src.sentiment import sentiment_scores
|
| 16 |
+
from src.translation import translate_to_english
|
| 17 |
+
from src.context_llm import get_context_probs
|
| 18 |
+
|
| 19 |
+
# ---- SUPPORTED LANGUAGES ----
|
| 20 |
+
SUPPORTED_LANGS = {"en", "hi", "ta", "ur", "bn", "te", "ml", "gu", "kn", "mr"}
|
| 21 |
+
|
| 22 |
+
LABELS = [
|
| 23 |
+
"Pro-India",
|
| 24 |
+
"Anti-India",
|
| 25 |
+
"Pro-Government",
|
| 26 |
+
"Anti-Government",
|
| 27 |
+
"Neutral"
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
def init_anchors():
|
| 31 |
+
"""
|
| 32 |
+
Load anchor text from data/anchors/, encode them, and inject into anchor_similarity module.
|
| 33 |
+
"""
|
| 34 |
+
print("[INIT] Loading anchor embeddings...")
|
| 35 |
+
anchor_dir = os.path.join(ROOT_DIR, "data", "anchors")
|
| 36 |
+
|
| 37 |
+
# Map keys to filenames
|
| 38 |
+
keys = ["pro_india", "anti_india", "pro_government", "anti_government", "neutral"]
|
| 39 |
+
loaded_anchors = {}
|
| 40 |
+
|
| 41 |
+
for key in keys:
|
| 42 |
+
file_path = os.path.join(anchor_dir, f"{key}.txt")
|
| 43 |
+
if not os.path.exists(file_path):
|
| 44 |
+
print(f"[WARNING] Anchor file missing: {file_path}")
|
| 45 |
+
continue
|
| 46 |
+
|
| 47 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 48 |
+
lines = [line.strip() for line in f if line.strip()]
|
| 49 |
+
|
| 50 |
+
if not lines:
|
| 51 |
+
print(f"[WARNING] Anchor file empty: {key}")
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
# Encode (batch)
|
| 55 |
+
# embedder is from src.embeddings
|
| 56 |
+
embeddings_matrix = embedder.encode(lines)
|
| 57 |
+
loaded_anchors[key] = embeddings_matrix
|
| 58 |
+
print(f" - Loaded {key}: {len(lines)} examples")
|
| 59 |
+
|
| 60 |
+
# Inject into module
|
| 61 |
+
from src.anchor_similarity import load_anchor_embeddings
|
| 62 |
+
load_anchor_embeddings(loaded_anchors)
|
| 63 |
+
print("[INIT] Anchor embeddings initialized.\n")
|
| 64 |
+
|
| 65 |
+
def classify(text: str):
|
| 66 |
+
# 1. Clean text
|
| 67 |
+
text = clean_text(text)
|
| 68 |
+
|
| 69 |
+
if len(text.strip()) == 0:
|
| 70 |
+
return {"error": "Empty input text"}
|
| 71 |
+
|
| 72 |
+
# 2. Language detection
|
| 73 |
+
lang, prob = detect_language(text)
|
| 74 |
+
|
| 75 |
+
# DEBUG (you can remove later)
|
| 76 |
+
print(f"[DEBUG] Detected language: {lang}, confidence: {round(prob, 3)}")
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# 2.5 Translation (if not English)
|
| 80 |
+
# We use English for processing because the Sarcasm/Sentiment models are English-specific
|
| 81 |
+
# and the Anchors are in English.
|
| 82 |
+
processing_text = text
|
| 83 |
+
if lang != 'en':
|
| 84 |
+
print(f"[INFO] Translating {lang} to en...")
|
| 85 |
+
translated = translate_to_english(text, source=lang)
|
| 86 |
+
print(f" -> {translated}")
|
| 87 |
+
processing_text = translated
|
| 88 |
+
|
| 89 |
+
# 3. Sentence embedding
|
| 90 |
+
text_embedding = embedder.encode(processing_text, normalize_embeddings=True)
|
| 91 |
+
|
| 92 |
+
# 4. Cosine similarity with anchors
|
| 93 |
+
similarity_scores = compute_similarity(
|
| 94 |
+
text_embedding=text_embedding,
|
| 95 |
+
anchor_embeddings=None # handled internally if global
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# 5. Sentiment + sarcasm
|
| 99 |
+
sentiment = sentiment_scores(processing_text) # [neg, neutral, pos]
|
| 100 |
+
sarcasm = sarcasm_score(processing_text) # float 0–1
|
| 101 |
+
|
| 102 |
+
# 5.5 LLM Context Analysis
|
| 103 |
+
context_probs = get_context_probs(processing_text)
|
| 104 |
+
|
| 105 |
+
# 6. Feature vector
|
| 106 |
+
features = build_features(
|
| 107 |
+
similarity=similarity_scores,
|
| 108 |
+
sentiment=sentiment,
|
| 109 |
+
sarcasm=sarcasm,
|
| 110 |
+
context_probs=context_probs
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# 7. Final prediction
|
| 114 |
+
label_idx, confidence = predict(features)
|
| 115 |
+
|
| 116 |
+
return {
|
| 117 |
+
"text": text,
|
| 118 |
+
"label": LABELS[label_idx],
|
| 119 |
+
"confidence": round(confidence, 3),
|
| 120 |
+
"language": lang,
|
| 121 |
+
"sarcasm_score": round(sarcasm, 3),
|
| 122 |
+
"sentiment": {
|
| 123 |
+
"negative": round(sentiment[0], 3),
|
| 124 |
+
"neutral": round(sentiment[1], 3),
|
| 125 |
+
"positive": round(sentiment[2], 3),
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
# ---- ENTRY POINT ----
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
init_anchors()
|
| 132 |
+
|
| 133 |
+
# Process test.txt if it exists
|
| 134 |
+
if os.path.exists("test.txt"):
|
| 135 |
+
print("Processing test.txt...")
|
| 136 |
+
with open("test.txt","r") as f:
|
| 137 |
+
for line in f:
|
| 138 |
+
if line.strip():
|
| 139 |
+
result= classify(line)
|
| 140 |
+
print(result)
|
| 141 |
+
print("-" * 50)
|
| 142 |
+
|
| 143 |
+
print("\n🔍 Reddit Political Stance Classifier")
|
| 144 |
+
print("Type 'exit' to quit\n")
|
| 145 |
+
|
| 146 |
+
while True:
|
| 147 |
+
text = input("Enter Reddit post: ").strip()
|
| 148 |
+
|
| 149 |
+
if text.lower() == "exit":
|
| 150 |
+
break
|
| 151 |
+
|
| 152 |
+
result = classify(text)
|
| 153 |
+
print("\nResult:")
|
| 154 |
+
print(result)
|
| 155 |
+
print("-" * 50)
|
src/__init__.py
ADDED
|
File without changes
|
src/anchor_similarity.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
+
|
| 4 |
+
print("anchor_similarity module loaded")
|
| 5 |
+
|
| 6 |
+
# --------------------------------------------------
|
| 7 |
+
# GLOBAL ANCHOR EMBEDDINGS
|
| 8 |
+
# --------------------------------------------------
|
| 9 |
+
# These must be filled during initialization
|
| 10 |
+
# Example structure:
|
| 11 |
+
# {
|
| 12 |
+
# "pro_india": np.ndarray,
|
| 13 |
+
# "anti_india": np.ndarray,
|
| 14 |
+
# "pro_government": np.ndarray,
|
| 15 |
+
# "anti_government": np.ndarray,
|
| 16 |
+
# "neutral": np.ndarray
|
| 17 |
+
# }
|
| 18 |
+
|
| 19 |
+
ANCHOR_EMBEDDINGS = {}
|
| 20 |
+
|
| 21 |
+
def load_anchor_embeddings(anchor_embeddings: dict):
|
| 22 |
+
"""
|
| 23 |
+
Load precomputed anchor embeddings once at startup
|
| 24 |
+
"""
|
| 25 |
+
global ANCHOR_EMBEDDINGS
|
| 26 |
+
ANCHOR_EMBEDDINGS = anchor_embeddings
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def compute_similarity(text_embedding: np.ndarray, anchor_embeddings=None) -> dict:
|
| 30 |
+
"""
|
| 31 |
+
Compute cosine similarity between text embedding and anchor sets
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
# Use global anchors if not explicitly passed
|
| 35 |
+
anchors = anchor_embeddings if anchor_embeddings is not None else ANCHOR_EMBEDDINGS
|
| 36 |
+
|
| 37 |
+
if not anchors:
|
| 38 |
+
raise ValueError("Anchor embeddings not loaded")
|
| 39 |
+
|
| 40 |
+
scores = {}
|
| 41 |
+
|
| 42 |
+
for label, vectors in anchors.items():
|
| 43 |
+
sims = cosine_similarity(
|
| 44 |
+
text_embedding.reshape(1, -1),
|
| 45 |
+
vectors
|
| 46 |
+
)[0]
|
| 47 |
+
|
| 48 |
+
# top-k mean similarity
|
| 49 |
+
scores[label] = float(np.mean(np.sort(sims)[-5:]))
|
| 50 |
+
|
| 51 |
+
return scores
|
src/config.py
ADDED
|
File without changes
|
src/context_llm.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
print("context_llm module loaded (Zero-Shot BART)")
|
| 5 |
+
|
| 6 |
+
# Lazily-initialised zero-shot classification pipeline (None until loaded).
classifier = None


def load_context_model():
    """Lazily initialise the zero-shot classification pipeline.

    Loads a distilled BART-MNLI model exactly once; subsequent calls are
    no-ops. A load failure is non-fatal: `classifier` stays None and
    callers fall back to neutral scores.
    """
    global classifier
    if classifier is not None:
        return  # already loaded

    try:
        # Prefer GPU when available; device -1 selects CPU for the pipeline.
        device = 0 if torch.cuda.is_available() else -1

        print("[LLM] Loading valhalla/distilbart-mnli-12-3 (Distilled) for context analysis...")
        classifier = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-3",
            device=device,
        )
        print("[LLM] Context model loaded successfully.")
    except Exception as e:
        # Non-fatal: downstream callers will just see neutral scores.
        print(f"[LLM] CRITICAL ERROR: {e}")


def get_context_probs(text: str) -> list:
    """Score *text* against four stance hypotheses via zero-shot NLI.

    Returns probabilities in fixed order:
        0: "criticism of the government"  (anti-govt)
        1: "criticism of the country"     (anti-India)
        2: "praise of the government"     (pro-govt)
        3: "praise of the country"        (pro-India)
    Falls back to a uniform [0.25] * 4 whenever the model is unavailable
    or inference fails.
    """
    if classifier is None:
        load_context_model()

    if classifier is None:
        # Model failed to load: return an uninformative uniform prior.
        return [0.25, 0.25, 0.25, 0.25]

    labels = [
        "criticism of the government",  # 0
        "criticism of the country",     # 1
        "praise of the government",     # 2
        "praise of the country",        # 3
    ]

    try:
        result = classifier(text, candidate_labels=labels, multi_label=False)

        # The pipeline returns labels/scores sorted by probability;
        # re-map them to our fixed label order.
        by_label = dict(zip(result["labels"], result["scores"]))
        return [by_label.get(lbl, 0.0) for lbl in labels]
    except Exception as e:
        print(f"[LLM] Inference failed: {e}")
        return [0.25, 0.25, 0.25, 0.25]
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer

print("embeddings module loaded")

# Multilingual sentence embedding model
# NOTE(review): chosen presumably for Hindi/English Reddit text — it supports
# 50+ languages; confirm the anchor embeddings were built with the same model.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Shared encoder instance, loaded eagerly at import time (downloads weights
# on first run).
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
src/feature_builder.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
print("feature_builder module loaded")
|
| 4 |
+
|
| 5 |
+
def build_features(similarity: dict, sentiment: list, sarcasm: float, context_probs: list) -> np.ndarray:
    """Assemble the 13-dim feature vector fed to the stance classifier.

    Args:
        similarity: anchor-similarity scores keyed by stance label
            (pro_india, anti_india, pro_government, anti_government, neutral).
        sentiment: [negative, neutral, positive] probabilities.
        sarcasm: sarcasm probability in [0, 1].
        context_probs: [political criticism, national criticism,
            political praise, national praise] from the zero-shot model.

    Returns:
        np.ndarray of shape (13,), dtype float32, in the fixed order the
        classifier was trained on.
    """
    vec = [
        similarity["pro_india"],
        similarity["anti_india"],
        similarity["pro_government"],
        similarity["anti_government"],
        similarity["neutral"],
    ]
    vec += [sentiment[0], sentiment[1], sentiment[2]]  # neg, neu, pos
    vec.append(sarcasm)
    vec += [context_probs[i] for i in range(4)]        # pol/nat crit, pol/nat praise

    return np.array(vec, dtype=np.float32)
|
src/language_detection.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langdetect import detect_langs, DetectorFactory
|
| 2 |
+
|
| 3 |
+
# Enforce determinism
|
| 4 |
+
DetectorFactory.seed = 0
|
| 5 |
+
|
| 6 |
+
# High-frequency English function words used as a cheap short-text heuristic.
_ENGLISH_HINTS = frozenset(
    {"the", "is", "are", "and", "of", "to", "in", "it", "has", "have", "for", "on", "with"}
)


def detect_language(text: str):
    """Detect the language of a Reddit comment.

    Returns a (language_code, probability) tuple. Short English texts are
    caught by a stopword heuristic first, because statistical detection is
    unreliable on few tokens (e.g. "India has deep flaws" -> Spanish).
    """
    # 1) Heuristic: any common English stopword => call it English outright.
    tokens = set(text.lower().split())
    if tokens & _ENGLISH_HINTS:
        return "en", 1.0

    # 2) Statistical detection via langdetect (best candidate wins).
    try:
        candidates = detect_langs(text)  # sorted best-first
        top = candidates[0]
        return top.lang, top.prob
    except Exception:
        # Empty, numeric, or otherwise undetectable text.
        return "unknown", 0.0
|
| 29 |
+
|
src/predict.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
import numpy as np

print("predict module loaded")

# Serialized stance classifier produced by the training scripts
# (src/train_logic_aligned.py / src/train_classifier.py).
MODEL_PATH = "models/final_classifier.pkl"

# Loaded eagerly at import time; raises FileNotFoundError if the model
# has not been trained yet.
clf = joblib.load(MODEL_PATH)
|
| 9 |
+
|
| 10 |
+
def predict(features: np.ndarray):
    """Predict the stance label index and a confidence score.

    Confidence is the relative margin between the two most probable
    classes, (p1 - p2) / p1, so it lies in [0, 1): near 0 when the top
    two classes are nearly tied, near 1 when the winner dominates.
    """
    probs = clf.predict_proba([features])[0]

    order = np.argsort(probs)  # ascending
    top = order[-1]
    runner_up = order[-2]

    margin = (probs[top] - probs[runner_up]) / probs[top]
    return top, float(margin)
|
src/preprocessing.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
print("preprocessing module loaded")

import re


def clean_text(text: str) -> str:
    """Basic text normalization for Reddit posts.

    Removes URLs, collapses runs of whitespace to single spaces, and
    strips leading/trailing whitespace.
    """
    # FIX: the module previously defined clean_text (and imported re) twice;
    # the duplicate definitions were identical, so they are merged here.
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"\s+", " ", text)     # normalize spaces
    return text.strip()
|
src/sarcasm.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("sarcasm module loaded (BERT Sarcasm Detector)")

# FIX: Use a Twitter-based Irony model (RoBERTa) which is better for social media/Reddit
MODEL_NAME = "cardiffnlp/twitter-roberta-base-irony"

try:
    # FIX: Force use_fast=False to avoid Windows rust-tokenizer crashes
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    # Inference-only module: switch off dropout/batch-norm training behaviour.
    model.eval()
except Exception as e:
    # Fail fast: the rest of the pipeline cannot run without this model.
    print(f"CRITICAL ERROR loading sarcasm model: {e}")
    raise e
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def sarcasm_score(text: str) -> float:
    """Return the probability (0-1) that *text* is sarcastic/ironic.

    Scored with the RoBERTa irony model configured above
    (cardiffnlp/twitter-roberta-base-irony); label index 1 is the
    "ironic" class.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    with torch.no_grad():
        logits = model(**encoded).logits

    probs = torch.softmax(logits, dim=1)
    # Probability of the positive (ironic) class.
    return float(probs[0][1])
|
src/sentiment.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("sentiment module loaded (English RoBERTa)")

# FIX: Use the standard (older) model which definitely has support for slow tokenizers
# The 'latest' version sometimes lacks full file support for use_fast=False on all setups
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

try:
    # FIX: Force use_fast=False to avoid Windows rust-tokenizer crashes
    # This uses the stable Python-based tokenizer (Byte-Level BPE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    # Inference-only module: switch off dropout/batch-norm training behaviour.
    model.eval()
except Exception as e:
    # Fail fast: the rest of the pipeline cannot run without this model.
    print(f"CRITICAL ERROR loading sentiment model: {e}")
    raise e
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def sentiment_scores(text: str):
    """Return sentiment probabilities as [negative, neutral, positive]."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    with torch.no_grad():
        logits = model(**encoded).logits

    # Model label order: negative, neutral, positive.
    return torch.softmax(logits, dim=1)[0].tolist()
|
src/train_classifier.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
+
from sklearn.linear_model import LogisticRegression
|
| 3 |
+
|
| 4 |
+
print("train_classifier module loaded")
|
| 5 |
+
|
| 6 |
+
def train_and_save(X, y):
    """Train the final stance classifier and persist it to disk.

    Args:
        X: array-like of shape (n_samples, n_features) — feature vectors
           (assumed to come from feature_builder.build_features; confirm
           with callers).
        y: array-like of shape (n_samples,) — integer stance labels.

    Side effects:
        Writes the fitted model to models/final_classifier.pkl, creating
        the directory first so joblib.dump cannot fail on a fresh checkout.
    """
    import os  # local import keeps the module's public import surface unchanged

    # NOTE: multi_class="multinomial" was removed — the parameter is
    # deprecated since scikit-learn 1.5 (removed in 1.7), and multinomial
    # is already the behaviour of the default lbfgs solver for multiclass y.
    clf = LogisticRegression(max_iter=2000)
    clf.fit(X, y)

    os.makedirs("models", exist_ok=True)
    joblib.dump(clf, "models/final_classifier.pkl")
    print("✅ Model trained and saved")
|
src/train_logic_aligned.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import joblib
import os
from sklearn.linear_model import LogisticRegression

# Output path, resolved relative to this file so the script works from any CWD.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "..", "models", "final_classifier.pkl")
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

print(">>> Generating Synthetic Logic-Aligned Training Data...")

# Feature layout — must stay in sync with feature_builder.build_features:
# 0: sim_pro_india
# 1: sim_anti_india
# 2: sim_pro_govt
# 3: sim_anti_govt
# 4: sim_neutral
# 5: neg
# 6: neu
# 7: pos
# 8: sarcasm
# 9: context_pol_crit (Anti-Govt)
# 10: context_nat_crit (Anti-India)
# 11: context_pol_praise (Pro-Govt)
# 12: context_nat_praise (Pro-India)
| 27 |
+
# Per-label feature overrides applied on top of the baseline noise.
# Each entry is (feature_index, low, high) for np.random.uniform, listed
# in the same order the original branches drew them (keeps RNG streams
# reproducible under a fixed seed).
_LABEL_OVERRIDES = {
    # 0: Pro-India — high pro-India sim, positive sentiment, national praise.
    0: [(0, 0.6, 1.0), (7, 0.5, 1.0), (5, 0.0, 0.2), (8, 0.0, 0.2),
        (12, 0.7, 1.0), (9, 0.0, 0.2)],
    # 1: Anti-India — high anti-India sim, negative sentiment, national criticism.
    1: [(1, 0.6, 1.0), (5, 0.5, 1.0), (7, 0.0, 0.2), (8, 0.0, 0.2),
        (10, 0.7, 1.0), (9, 0.0, 0.3)],
    # 2: Pro-Government — high pro-govt sim, positive sentiment, political praise.
    2: [(2, 0.6, 1.0), (7, 0.5, 1.0), (5, 0.0, 0.2), (8, 0.0, 0.2),
        (11, 0.7, 1.0), (10, 0.0, 0.2)],
    # 3: Anti-Government — high anti-govt sim, negative sentiment, political criticism.
    3: [(3, 0.6, 1.0), (5, 0.5, 1.0), (7, 0.0, 0.2), (8, 0.0, 0.2),
        (9, 0.7, 1.0), (10, 0.0, 0.4)],
    # 4: Neutral — high neutral sim/sentiment, everything else kept low.
    4: [(4, 0.5, 1.0), (6, 0.5, 1.0), (5, 0.0, 0.2), (7, 0.0, 0.2),
        (8, 0.0, 0.1), (9, 0.0, 0.3), (10, 0.0, 0.3)],
}


def generate_sample(label_idx):
    """Draw one synthetic 13-dim feature vector for class *label_idx*.

    Starts from uniform baseline noise in [0, 0.3) and then overwrites the
    label-characteristic features with stronger draws from _LABEL_OVERRIDES.
    Sarcasm (index 8) is deliberately kept low for every class.
    """
    feats = np.random.uniform(0.0, 0.3, 13)
    for idx, low, high in _LABEL_OVERRIDES.get(label_idx, ()):
        feats[idx] = np.random.uniform(low, high)
    return feats
|
| 83 |
+
|
| 84 |
+
# Generate a balanced synthetic dataset: SAMPLES_PER_CLASS rows per label.
X = []
y = []
SAMPLES_PER_CLASS = 500

for label in range(5):
    for _ in range(SAMPLES_PER_CLASS):
        X.append(generate_sample(label))
        y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Training Logistic Regression on {len(X)} synthetic samples (13 features)...")
# NOTE: multi_class='multinomial' was removed — the parameter is deprecated
# since scikit-learn 1.5 (removed in 1.7), and multinomial is already the
# behaviour of the lbfgs solver for multiclass targets.
clf = LogisticRegression(max_iter=1000, solver='lbfgs')
clf.fit(X, y)

print(f"Accuracy on Training Set: {clf.score(X, y):.4f}")

print(f"Saving model to {MODEL_PATH}...")
joblib.dump(clf, MODEL_PATH)
print("DONE.")
|
src/translation.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from deep_translator import GoogleTranslator
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
def translate_to_english(text: str, source="auto") -> str:
    """Translate input text to English using Google Translator.

    Retries once on failure (with a brief pause), falling back to the
    original text if both attempts fail.

    Args:
        text: the text to translate.
        source: source language code, or "auto" for auto-detection.
    """
    # FIX: the docstring promised a retry but the old code attempted the
    # translation only once (and `import time` was unused).
    last_error = None
    for attempt in range(2):
        try:
            translator = GoogleTranslator(source=source, target='en')
            return translator.translate(text)
        except Exception as e:
            last_error = e
            if attempt == 0:
                time.sleep(0.2)  # brief pause before the single retry
    print(f"[WARNING] Translation failed: {last_error}")
    # Fallback to original text if translation fails
    return text
|
train_once.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np

from src.train_classifier import train_and_save

# DUMMY FEATURES — 13 features to match feature_builder.build_features
# (5 anchor similarities + 3 sentiment + 1 sarcasm + 4 context probs).
# FIX: was 9, which would train a model incompatible with the live pipeline.
X = np.random.rand(20, 13)

# DUMMY LABELS (5 classes)
y = np.random.randint(0, 5, size=20)

train_and_save(X, y)
|