Quivara committed on
Commit cbc9b35 · verified · 1 Parent(s): b2e963e

Update alisto_project/backend/ingest_reddit.py

Files changed (1)
  1. alisto_project/backend/ingest_reddit.py +247 -429
alisto_project/backend/ingest_reddit.py CHANGED
@@ -1,429 +1,247 @@
- import asyncpraw
- import asyncio
- import os
- import torch
- import pickle
- import numpy as np
- import torch.nn.functional as F
- from datetime import datetime
- from dotenv import load_dotenv
- from flask import Flask
- from models import db, DisasterPost
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- from ner_extractor import extract_entities
-
- # 1. Config & Setup
- # defines the subreddits to be monitored by the scraper
- SUBREDDITS = "AlistoSimulation"
- # SUBREDDITS = "Philippines+NaturalDisasters+DisasterUpdatePH+Assistance+Typhoon+AlistoSimulation"
-
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
- # loads environment variables from .env file
- load_dotenv(os.path.join(BASE_DIR, '../.env'))
-
- # initializes the Flask application context for database access
- app = Flask(__name__)
- DB_PATH = os.path.join(BASE_DIR, 'alisto.db')
- app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
- app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
- # sets a timeout for a stable database connection
- app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
- db.init_app(app)
-
- # 2. Load Models
- print("Loading ALISTO Brains...")
- MODEL_DIR = os.path.join(BASE_DIR, 'models')
- ROBERTA_DIR = os.path.join(MODEL_DIR, 'roberta_model')
- TFIDF_PATH = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
-
- # A. RoBERTa (XLM-R Multilingual)
- # loads the RoBERTa tokenizer and sequence classification model (Context Expert)
- try:
-     tokenizer = AutoTokenizer.from_pretrained(ROBERTA_DIR)
-     roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_DIR)
-     device = torch.device("cpu")  # determines the device for model execution
-     roberta_model.to(device)
-     roberta_model.eval()
-     print("✅ Context Expert (XLM-R) loaded")
- except Exception as e:
-     print(f"❌ Error loading RoBERTa: {e}")
-     exit()
-
- # B. TF-IDF (The Gatekeeper)
- # loads the pre-trained TF-IDF vectorizer and ensemble model (Gatekeeper)
- try:
-     with open(TFIDF_PATH, 'rb') as f:
-         tfidf_model = pickle.load(f)
-     print("✅ Gatekeeper (TF-IDF) loaded")
- except Exception as e:
-     print(f"❌ Error loading TF-IDF: {e}")
-     tfidf_model = None
-
- # 3. Reference Lists (Kept from your original)
- # list of Philippine locations used for basic geo-validation
- PHILIPPINE_LOCATIONS = [
-     "Philippines", "PH", "Luzon", "Visayas", "Mindanao", "Metro Manila", "NCR",
-     "Manila", "Quezon City", "Makati", "Taguig", "Pasig", "Mandaluyong",
-     "Marikina", "Las Pinas", "Las Piñas", "Muntinlupa", "Caloocan",
-     "Paranaque", "Parañaque", "Valenzuela", "Pasay", "Malabon",
-     "Navotas", "San Juan", "Pateros",
-     "Cavite", "Naic", "Bacoor", "Imus", "Dasmarinas", "Dasmariñas",
-     "General Trias", "Tagaytay", "Kawit", "Noveleta", "Rosario", "Tanza",
-     "Silang", "Trece Martires", "Laguna", "Calamba", "Santa Rosa", "Binan",
-     "Biñan", "San Pedro", "Cabuyao", "Los Banos", "Los Baños", "Rizal",
-     "Antipolo", "Cainta", "Taytay", "San Mateo", "Binangonan", "Batangas",
-     "Bulacan", "Pampanga", "Tarlac", "Cebu", "Iloilo", "Tacloban",
-     "Davao", "Cagayan", "Bicol", "Albay", "Isabela"
- ]
-
- # function to process a single Reddit submission through all filters and save it
- async def process_post(post):
-     """handles logic for a single Reddit submission (filtering, AI, saving)"""
-     try:
-         full_text = f"{post.title} {post.selftext}"
-
-         # A. Check for Duplicates & Credibility (Unchanged logic)
-         # checks for existing post ID in the database
-         with app.app_context():
-             exists = DisasterPost.query.filter_by(reddit_id=post.id).first()
-             if exists: return
-         # blocks posts from suspicious new/low-karma accounts
-         if not is_credible_user(post):
-             print(f"\n------------------- DEBUG REJECTION -------------------")
-             print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-             print(f"REASON: Credibility Check (Account too new/Low Karma)")
-             print(f"---------------------------------------------------------\n")
-             return
-
-         # B. Logic Filter (First Defense) (Unchanged logic)
-         # runs simple keyword checks to filter news/financial/irrelevant content
-         is_bad, reason = is_news_or_irrelevant(full_text)
-         if is_bad:
-             print(f"\n------------------- DEBUG REJECTION -------------------")
-             print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-             print(f"REASON: Logic Filter (Common Sense Layer) Categorized as: {reason}")
-             print(f"---------------------------------------------------------\n")
-             return
-
-         # C. AI Analysis (Unchanged logic)
-         # runs the cascade AI check (TF-IDF then RoBERTa)
-         is_urgent, score, source = predict_urgency(full_text)
-         if not is_urgent:
-             print(f"\n------------------- DEBUG REJECTION -------------------")
-             print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-             print(f"REASON: AI Confidence too low Score: {score:.2%} (Source: {source})")
-             print(f"---------------------------------------------------------\n")
-             return
-
-         # D. Entity Extraction
-         # extracts location, contact number, and contact person name
-         ner_results = extract_entities(full_text)
-         city_location = ner_results.get('location', "Unknown Location")
-         full_raw_address = ner_results.get('full_address', "Check Post")
-         contact_num = ner_results.get('contact', None)
-         contact_person_name = ner_results.get('contact_person_name', None)
-
-         # E. Final Triage and Data Preparation
-         # assigns location and determines disaster/assistance type
-         if isinstance(city_location, list):
-             location = city_location[0] if city_location else "Unknown Location"
-         else:
-             location = city_location
-
-         disaster_type = get_disaster_type(full_text)
-         assistance_type = get_assistance_type(full_text)
-
-         # 1. Calculate Dynamic Urgency (NEW)
-         # assigns High, Medium, or Low urgency based on severity keywords
-         dynamic_urgency = assign_dynamic_urgency(full_text)
-
-         # 2. Finalize Author (Fallback Logic)
-         # defaults to Reddit username if no contact name is explicitly extracted
-         reddit_username = str(post.author) if post.author else "Unknown"
-         final_author = contact_person_name if contact_person_name else reddit_username
-
-         # 3. Print Final Alert Confirmation
-         print(f"""------------------- ALERT SAVED -------------------\n🚨 ALERT ({score:.2%}): {disaster_type} in {location} Urgency: {dynamic_urgency} \n---------------------------------------------------------""")
-
-         # F. Single Database Creation and Commit
-         # creates and commits the final DisasterPost object to the database
-         new_post = DisasterPost(
-             reddit_id=post.id,
-             title=post.title,
-             content=post.selftext or post.title,
-             author=final_author,
-             location=location,
-             full_address=full_raw_address,
-             contact_number=contact_num,
-             disaster_type=disaster_type,
-             assistance_type=assistance_type,
-             urgency_level=dynamic_urgency,
-             is_help_request=True,
-             timestamp=datetime.utcfromtimestamp(post.created_utc)
-         )
-
-         with app.app_context():
-             db.session.add(new_post)
-             db.session.commit()
-
-     except Exception as e:
-         print(f"Post Processing Error for {post.id}: {e}")
-
- # validates if the extracted location is relevant to the Philippines
- def check_for_philippine_location(location_list):
-     if not location_list: return False
-     ph_locations = [loc.lower() for loc in PHILIPPINE_LOCATIONS]
-     for extracted_loc in location_list:
-         # Check partial match (e.g., "Marikina City" matches "Marikina")
-         for known_loc in ph_locations:
-             if known_loc in extracted_loc.lower() or extracted_loc.lower() in known_loc:
-                 return True
-     return False
-
- # classifies the type of disaster based on severity keywords
- def get_disaster_type(text):
-     text_lower = text.lower()
-     mapping = {
-         "Earthquake": ["quake", "lindol", "shake", "aftershock"],
-         "Landslide": ["landslide", "guho", "mudslide", "natabunan"],
-         "Volcano": ["volcano", "lava", "ash", "magma", "taal", "mayon"],
-         "Fire": ["fire", "sunog", "burn", "smoke"],
-         "Typhoon": ["typhoon", "bagyo", "storm", "wind", "signal", "ulysses", "odette"],
-         "Flood": ["flood", "baha", "water", "river", "drown", "lubog", "taas ng tubig"]
-     }
-
-     for dtype, keywords in mapping.items():
-         if any(k in text_lower for k in keywords):
-             return dtype
-     return "General Emergency"
-
- # classifies the specific type of assistance needed (e.g., Medical, Rescue, Food)
- def get_assistance_type(text):
-     """determines the specific help needed using Nested Priority"""
-     text = text.lower()
-
-     # --- 1. IMMEDIATE RESCUE (Life Threatening) ---
-     rescue_kw = [
-         "rescue", "saklolo", "trapped", "stuck", "stranded",
-         "bubong", "roof", "boat", "bangka", "drowning", "lunod",
-         "di makalabas", "unable to leave"
-     ]
-     if any(k in text for k in rescue_kw):
-
-         critical_medical_override_kw = [
-             "bleeding", "unconscious", "head injury", "head wound",
-             "severely bleeding", "stroke", "heart attack", "trauma"
-         ]
-         if any(k in text for k in critical_medical_override_kw):
-             return "Medical"
-
-         return "Rescue"  # if no critical medical keywords found
-
-     # --- 2. MEDICAL (Specific Needs/Ambulance) ---
-     # handles standalone medical needs if no rescue keywords were found
-     medical_kw = [
-         "medical", "doctor", "gamot", "medicine", "insulin", "dialysis",
-         "hospital", "oxygen", "pregnant", "labor", "manganganak", "ambulance",
-         "first aid", "pills", "medication"
-     ]
-     if any(k in text for k in medical_kw):
-         return "Medical"
-
-     # --- 3. EVACUATION (Shelter/Transport) ---
-     # classifies the need for temporary shelter or transport
-     evac_kw = [
-         "evacuate", "evacuation", "shelter", "center", "likas", "tents",
-         "matutuluyan", "alis", "transportation", "walang matutuluyan"
-     ]
-     if any(k in text for k in evac_kw):
-         return "Evacuation"
-
-     # --- 4. FOOD & WATER (Logistics) ---
-     # classifies the need for essential supplies (food, water, formula)
-     food_kw = [
-         "food", "pagkain", "water", "tubig", "gutom", "hungry", "relief",
-         "goods", "makakain", "inumin", "groceries", "supplies", "supply", "wala ng stock",
-         "gatas", "milk", "formula", "baby supplies", "ubos na", "wala na", "stock", "stock ng"
-     ]
-     if any(k in text for k in food_kw):
-         return "Food/Water"
-
-     return "General Assistance"
-
-
- # --- LOGIC FILTERS (The "Common Sense" Layer) ---
- # runs simple logic checks to filter out news reports and non-urgent context
- def is_news_or_irrelevant(text):
-     text_lower = text.lower()
-
-     # 1. NEWS & REPORTS
-     news_indicators = [
-         "breaking:", "just in:", "news:", "update:", "report:",
-         "casualties", "death toll", "according to", "reported that",
-         "suspension", "declared", "signal no", "public advisory",
-         "weather update", "volcano alert", "mmda", "pagasa"
-     ]
-
-     # 2. MONEY / SELLING
-     financial_indicators = [
-         "gcash", "paypal", "budget", "loan", "selling",
-         "fundraising", "donate", "send funds"
-     ]
-
-     # 3. IRRELEVANT CONTEXT
-     irrelevant_contexts = [
-         "how can i help", "where to donate", "thoughts and prayers",
-         "keep safe", "god bless", "praying for", "discussion:", "opinion:"
-     ]
-
-     # Logic Checks
-     if any(ind in text_lower for ind in news_indicators):
-         return True, "News/Report"
-
-     # blocks financial requests unless life-threatening keywords are also present
-     has_financial = any(ind in text_lower for ind in financial_indicators)
-     is_life_death = any(k in text_lower for k in ["trapped", "lubog", "roof", "rescue", "drowning", "stuck"])
-
-     if has_financial and not is_life_death:
-         return True, "Financial/Non-Urgent"
-
-     # blocks posts containing non-urgent discussion or commentary
-     if any(ctx in text_lower for ctx in irrelevant_contexts):
-         return True, "Context/NotUrgent"
-
-     return False, None
-
- # runs the two-stage AI classification check (TF-IDF then RoBERTa)
- def predict_urgency(text):
-
-     # 1. Gatekeeper (TF-IDF)
-     # quickly rejects posts with extremely low urgency confidence (below 20%)
-     if tfidf_model:
-         tfidf_probs = tfidf_model.predict_proba([text])[0]
-         tfidf_conf = tfidf_probs[1]
-
-         # If the fast model is sure it's junk, skip the heavy lifting
-         if tfidf_conf < 0.20:
-             return False, tfidf_conf, "TF-IDF Reject"
-
-     # 2. Context Expert (RoBERTa)
-     # runs the slower, context-aware model for final classification
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
-     with torch.no_grad():
-         outputs = roberta_model(**inputs)
-         probs = F.softmax(outputs.logits, dim=-1)
-         roberta_conf = probs[0][1].item()  # Probability of 'Rescue Request'
-
-     # final acceptance threshold (40%) for the RoBERTa model
-     return (roberta_conf > 0.4), roberta_conf, "RoBERTa"
-
- # assigns the final severity level (High, Medium, Low) based on severity keywords
- def assign_dynamic_urgency(text):
-     text_lower = text.lower()
-
-     # 1. HIGH URGENCY (Immediate Life-Threatening Event or Critical Medical Need)
-     high_keywords = [
-         "bleeding", "unconscious", "severely injured", "severe injury", "life threatening",
-         "insulin", "oxygen", "ambulance", "urgent medicine", "doctor", "hospital",
-
-         "trap", "trapped", "bubong", "collapsed", "di mapigilan", "drowning",
-         "lampas tao", "lubog", "delikado", "baha na", "mamatay"
-     ]
-     if any(k in text_lower for k in high_keywords):
-         return "High"
-
-     # 2. MEDIUM URGENCY (Time-Sensitive, Logistical Crisis)
-     medium_keywords = [
-         "stranded", "running out", "evacuate", "kailangan agad", "lowbat",
-         "paubos", "senior", "bedridden", "disabled", "gatas", "formula"
-     ]
-     if any(k in text_lower for k in medium_keywords):
-         return "Medium"
-
-     # 3. LOW URGENCY (General Supplies/Warning)
-     # posts that pass the AI but lack the above severity indicators fall here
-     return "Low"
-
- # blocks posts from accounts created less than 2 days ago or with negative karma
- def is_credible_user(post):
-     try:
-         author = post.author
-
-         # checks if author is deleted or unknown
-         if not author:
-             return False
-
-         # 1. Check Account Age (Must be older than 2 days)
-         created_time = datetime.utcfromtimestamp(author.created_utc)
-         account_age = datetime.utcnow() - created_time
-
-         if account_age.days < 2:
-             print(f"   ⚠️ Blocked: Account too new ({account_age.days} days)")
-             return False
-
-         # 2. Check Karma (Must not be negative)
-         total_karma = author.comment_karma + author.link_karma
-         if total_karma < -5:
-             print(f"   ⚠️ Blocked: Negative Karma ({total_karma})")
-             return False
-
-         return True
-
-     except Exception as e:
-         # allows posts to pass if Reddit API fails to get user info
-         return True
-
-
- # 4. Main Scraper Loop
- # orchestrates the entire scraping process (historical scan + real-time stream)
- async def scrape_reddit():
-     print("Connecting to Reddit API...")
-
-     client_id = os.getenv("REDDIT_CLIENT_ID")
-     client_secret = os.getenv("REDDIT_CLIENT_SECRET")
-
-     if not client_id or not client_secret:
-         print("❌ Error: Client ID or Secret missing in .env")
-         return
-
-     # initializes PRAW using the secure Client Credentials Flow (read-only)
-     reddit = asyncpraw.Reddit(
-         client_id=client_id,
-         client_secret=client_secret,
-         user_agent=os.getenv("REDDIT_USER_AGENT", "script:alisto_bot:v3.0")
-     )
-
-     try:
-         subreddit = await reddit.subreddit(SUBREDDITS)
-         print(f"👁️  ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
-
-         # --- PHASE 1: FETCH LATEST EXISTING POSTS (e.g., last 500) ---
-         print("🔍 Scanning last 500 posts for missed alerts...")
-         # iterates over the last 500 posts asynchronously
-         async for post in subreddit.new(limit=500):
-             await process_post(post)
-
-         print("✅ Historical scan complete")
-
-         # --- PHASE 2: START REAL-TIME STREAM (Forever Loop) ---
-         print("📡 Starting real-time stream for new submissions...")
-
-         # starts the continuous loop to monitor for new submissions
-         async for post in subreddit.stream.submissions(skip_existing=False):
-             await process_post(post)
-
-     except Exception as e:
-         print(f"Global Scraper Error: {e}")
-     finally:
-         await reddit.close()
-         print("Scraper stopped")
-
-
- # executes the main scraping loop when the script is run
- if __name__ == "__main__":
-     try:
-         loop = asyncio.new_event_loop()
-         asyncio.set_event_loop(loop)
-         loop.run_until_complete(scrape_reddit())
-     except KeyboardInterrupt:
-         print("\n🛑 Stopped by user")
 
+ import asyncpraw
+ import asyncio
+ import os
+ import torch
+ import pickle
+ import sys
+ import torch.nn.functional as F  # needed by the F.softmax call in predict_urgency below
+ from datetime import datetime
+ from dotenv import load_dotenv
+ from flask import Flask
+ from models import db, DisasterPost
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from ner_extractor import extract_entities
+ from huggingface_hub import hf_hub_download
+
+ # Force prints to appear immediately in Hugging Face logs
+ def log(msg):
+     print(msg, flush=True)
+
+ log("🚀 INGEST SCRIPT LAUNCHED! Initializing...")
+
+ # 1. Config & Setup
+ SUBREDDITS = "AlistoSimulation"
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # Load .env (Try multiple locations)
+ env_path_1 = os.path.join(BASE_DIR, '../.env')
+ if os.path.exists(env_path_1):
+     load_dotenv(env_path_1)
+     log("✅ Loaded .env from alisto_project folder")
+ else:
+     log("⚠️ No .env file found in alisto_project folder")
+
+ app = Flask(__name__)
+ DB_PATH = os.path.join(BASE_DIR, 'alisto.db')
+ app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
+ app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+ app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
+ db.init_app(app)
+
+ # 2. Load Models
+ # FIXED: Points to the Cloud Repository, not a local folder
+ MODEL_ID = "Quivara/alisto-brain"
+ log("🧠 Loading ALISTO Brains from Cloud (This takes 1-2 mins)...")
+
+ try:
+     # Load Tokenizer & Model from Hugging Face Hub
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, subfolder="roberta_model")
+     roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, subfolder="roberta_model", num_labels=2)
+
+     device = torch.device("cpu")
+     roberta_model.to(device)
+     roberta_model.eval()
+     log(f"✅ Context Expert loaded from {MODEL_ID}")
+
+ except Exception as e:
+     log(f"❌ Error loading Model: {e}")
+     # We exit here because the app is useless without the brain
+     sys.exit(1)
+
+ try:
+     log("📥 Downloading Gatekeeper (TF-IDF)...")
+     tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
+     with open(tfidf_path, 'rb') as f:
+         tfidf_model = pickle.load(f)
+     log("✅ Gatekeeper (TF-IDF) loaded")
+ except Exception as e:
+     log(f"⚠️ Error loading TF-IDF (Ignore warnings): {e}")
+     tfidf_model = None
+
+ # 3. Helpers (Logic & Filters)
+ PHILIPPINE_LOCATIONS = [
+     "Philippines", "PH", "Luzon", "Visayas", "Mindanao", "Metro Manila", "NCR",
+     "Manila", "Quezon City", "Makati", "Taguig", "Pasig", "Mandaluyong",
+     "Marikina", "Las Pinas", "Las Piñas", "Muntinlupa", "Caloocan",
+     "Paranaque", "Parañaque", "Valenzuela", "Pasay", "Malabon",
+     "Navotas", "San Juan", "Pateros",
+     "Cavite", "Naic", "Bacoor", "Imus", "Dasmarinas", "Dasmariñas",
+     "General Trias", "Tagaytay", "Kawit", "Noveleta", "Rosario", "Tanza",
+     "Silang", "Trece Martires", "Laguna", "Calamba", "Santa Rosa", "Binan",
+     "Biñan", "San Pedro", "Cabuyao", "Los Banos", "Los Baños", "Rizal",
+     "Antipolo", "Cainta", "Taytay", "San Mateo", "Binangonan", "Batangas",
+     "Bulacan", "Pampanga", "Tarlac", "Cebu", "Iloilo", "Tacloban",
+     "Davao", "Cagayan", "Bicol", "Albay", "Isabela"
+ ]
+
+ def is_news_or_irrelevant(text):
+     text_lower = text.lower()
+     news_indicators = ["breaking:", "just in:", "news:", "update:", "report:", "mmda", "pagasa"]
+     financial_indicators = ["gcash", "paypal", "budget", "loan", "selling", "donate"]
+     irrelevant_contexts = ["how can i help", "thoughts and prayers", "discussion:", "opinion:"]
+
+     if any(ind in text_lower for ind in news_indicators): return True, "News/Report"
+
+     has_financial = any(ind in text_lower for ind in financial_indicators)
+     is_life_death = any(k in text_lower for k in ["trapped", "lubog", "roof", "rescue", "drowning"])
+     if has_financial and not is_life_death: return True, "Financial/Non-Urgent"
+
+     if any(ctx in text_lower for ctx in irrelevant_contexts): return True, "Context/NotUrgent"
+     return False, None
+
+ def predict_urgency(text):
+     if tfidf_model:
+         tfidf_probs = tfidf_model.predict_proba([text])[0]
+         if tfidf_probs[1] < 0.20: return False, tfidf_probs[1], "TF-IDF Reject"
+
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
+     with torch.no_grad():
+         outputs = roberta_model(**inputs)
+     probs = F.softmax(outputs.logits, dim=-1)
+     roberta_conf = probs[0][1].item()
+     return (roberta_conf > 0.4), roberta_conf, "RoBERTa"
+
+ def get_disaster_type(text):
+     text_lower = text.lower()
+     mapping = {
+         "Earthquake": ["quake", "lindol", "shake"], "Landslide": ["landslide", "guho"],
+         "Volcano": ["volcano", "lava", "ash", "taal"], "Fire": ["fire", "sunog", "burn"],
+         "Typhoon": ["typhoon", "bagyo", "storm"], "Flood": ["flood", "baha", "water", "lubog"]
+     }
+     for dtype, keywords in mapping.items():
+         if any(k in text_lower for k in keywords): return dtype
+     return "General Emergency"
+
+ def get_assistance_type(text):
+     text = text.lower()
+     if any(k in text for k in ["rescue", "trapped", "roof"]): return "Rescue"
+     if any(k in text for k in ["medical", "doctor", "hospital"]): return "Medical"
+     if any(k in text for k in ["evacuate", "shelter"]): return "Evacuation"
+     if any(k in text for k in ["food", "water"]): return "Food/Water"
+     return "General Assistance"
+
+ def assign_dynamic_urgency(text):
+     text_lower = text.lower()
+     high_keywords = ["bleeding", "unconscious", "life threatening", "trap", "trapped", "drowning", "lubog"]
+     medium_keywords = ["stranded", "running out", "evacuate", "lowbat", "senior"]
+     if any(k in text_lower for k in high_keywords): return "High"
+     if any(k in text_lower for k in medium_keywords): return "Medium"
+     return "Low"
+ # 4. Processing Logic
+ async def process_post(post):
+     try:
+         full_text = f"{post.title} {post.selftext}"
+
+         with app.app_context():
+             exists = DisasterPost.query.filter_by(reddit_id=post.id).first()
+             if exists: return
+
+         # Filters
+         is_bad, reason = is_news_or_irrelevant(full_text)
+         if is_bad: return
+
+         is_urgent, score, source = predict_urgency(full_text)
+         if not is_urgent: return
+
+         # Extraction
+         ner_results = extract_entities(full_text)
+         city_location = ner_results.get('location', "Unknown Location")
+         if isinstance(city_location, list): location = city_location[0] if city_location else "Unknown Location"
+         else: location = city_location
+
+         disaster_type = get_disaster_type(full_text)
+         dynamic_urgency = assign_dynamic_urgency(full_text)
+
+         # Determine Author
+         contact_person = ner_results.get('contact_person_name', None)
+         final_author = contact_person if contact_person else str(post.author)
+
+         log(f"🚨 ALERT SAVED: {disaster_type} in {location} ({dynamic_urgency})")
+
+         # Save to DB
+         new_post = DisasterPost(
+             reddit_id=post.id,
+             title=post.title,
+             content=post.selftext or post.title,
+             author=final_author,
+             location=location,
+             full_address=ner_results.get('full_address', "Check Post"),
+             contact_number=ner_results.get('contact', None),
+             disaster_type=disaster_type,
+             assistance_type=get_assistance_type(full_text),
+             urgency_level=dynamic_urgency,
+             is_help_request=True,
+             status='New',
+             timestamp=datetime.utcfromtimestamp(post.created_utc)
+         )
+
+         with app.app_context():
+             db.session.add(new_post)
+             db.session.commit()
+
+     except Exception as e:
+         log(f"Processing Error: {e}")
+
+ # 5. Main Loop (POLLING MODE - The Fix for Hugging Face)
+ async def scrape_reddit():
+     log("🔌 Connecting to Reddit API (Polling Mode)...")
+
+     client_id = os.getenv("REDDIT_CLIENT_ID")
+     client_secret = os.getenv("REDDIT_CLIENT_SECRET")
+
+     if not client_id or not client_secret:
+         log("❌ CRITICAL ERROR: Client ID or Secret missing in .env")
+         return
+
+     # Authenticate
+     reddit = asyncpraw.Reddit(
+         client_id=client_id,
+         client_secret=client_secret,
+         user_agent=os.getenv("REDDIT_USER_AGENT"),
+         username=os.getenv("REDDIT_USERNAME"),
+         password=os.getenv("REDDIT_PASSWORD")
+     )
+
+     log(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60s...")
+     last_id = None
+
+     while True:
+         try:
+             subreddit = await reddit.subreddit(SUBREDDITS)
+
+             # Fetch ONLY 1 post to minimize bandwidth and look like a human
+             async for post in subreddit.new(limit=1):
+                 if post.id != last_id:
+                     log(f"📥 New Post Detected: {post.title}")
+                     await process_post(post)
+                     last_id = post.id
+                 else:
+                     # Silence "no new post" messages to keep logs clean
+                     pass
+
+             # Wait 60 seconds (The Fix for 403 Error)
+             await asyncio.sleep(60)
+
+         except Exception as e:
+             log(f"⚠️ Connection glitch (Retrying in 2m): {e}")
+             await asyncio.sleep(120)
+
+     await reddit.close()
+
+ if __name__ == "__main__":
+     try:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+         loop.run_until_complete(scrape_reddit())
+     except KeyboardInterrupt:
+         log("\n🛑 Stopped by user")
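
For local testing, the polling loop above reads its credentials from environment variables via os.getenv. A minimal .env sketch with placeholder values (the variable names are exactly the ones the script reads; the user-agent value is borrowed from the old version's fallback, since the new code passes os.getenv("REDDIT_USER_AGENT") with no default):

REDDIT_CLIENT_ID=your_client_id
REDDIT_CLIENT_SECRET=your_client_secret
REDDIT_USER_AGENT=script:alisto_bot:v3.0
REDDIT_USERNAME=your_bot_username
REDDIT_PASSWORD=your_bot_password

The file must sit one directory above backend/ (the script loads os.path.join(BASE_DIR, '../.env')). Note that username and password are new in this commit: it switches the bot from the old read-only client-credentials flow to a password grant.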