Quivara committed on
Commit
980b953
·
verified ·
1 Parent(s): 8a49862

Update alisto_project/backend/ingest_reddit.py

Browse files
Files changed (1) hide show
  1. alisto_project/backend/ingest_reddit.py +346 -101
alisto_project/backend/ingest_reddit.py CHANGED
@@ -3,7 +3,8 @@ import asyncio
3
  import os
4
  import torch
5
  import pickle
6
- import sys
 
7
  from datetime import datetime
8
  from dotenv import load_dotenv
9
  from flask import Flask
@@ -12,125 +13,164 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  from ner_extractor import extract_entities
13
  from huggingface_hub import hf_hub_download
14
 
15
- # Force prints to appear immediately in Hugging Face logs
16
- def log(msg):
17
- print(msg, flush=True)
18
-
19
- log("🚀 INGEST SCRIPT LAUNCHED! Initializing...")
20
-
21
  # 1. Config & Setup
 
22
  SUBREDDITS = "AlistoSimulation"
 
 
23
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 
24
 
25
- # Load .env (Try multiple locations)
26
- env_path_1 = os.path.join(BASE_DIR, '../.env')
27
  if os.path.exists(env_path_1):
28
  load_dotenv(env_path_1)
29
- log("✅ Loaded .env from alisto_project folder")
 
 
 
30
  else:
31
- log("⚠️ No .env file found in alisto_project folder")
32
 
 
33
  app = Flask(__name__)
34
  DB_PATH = os.path.join(BASE_DIR, 'alisto.db')
35
  app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
36
  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
 
37
  app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
38
  db.init_app(app)
39
 
40
  # 2. Load Models
41
- log("🧠 Loading ALISTO Brains from Cloud (This takes 1-2 mins)...")
 
42
  MODEL_ID = "Quivara/alisto-brain"
43
 
44
  try:
 
45
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, subfolder="roberta_model")
 
 
46
  roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, subfolder="roberta_model", num_labels=2)
 
47
  device = torch.device("cpu")
48
  roberta_model.to(device)
49
  roberta_model.eval()
50
- log(f"✅ Context Expert loaded from {MODEL_ID}")
 
51
  except Exception as e:
52
- log(f"❌ Error loading Model: {e}")
 
 
53
 
 
 
54
  try:
55
- log("📥 Downloading Gatekeeper (TF-IDF)...")
 
56
  tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
 
57
  with open(tfidf_path, 'rb') as f:
58
  tfidf_model = pickle.load(f)
59
- log("✅ Gatekeeper (TF-IDF) loaded")
60
  except Exception as e:
61
- log(f"❌ Error loading TF-IDF (Ignore warnings): {e}")
62
  tfidf_model = None
63
 
64
- # 3. Helpers
65
- def is_news_or_irrelevant(text):
66
- text = text.lower()
67
- if any(x in text for x in ["breaking:", "news:", "update:", "selling", "donate"]): return True, "Irrelevant"
68
- return False, None
69
-
70
- def predict_urgency(text):
71
- if tfidf_model and tfidf_model.predict_proba([text])[0][1] < 0.20: return False, 0, "TF-IDF"
72
- inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
73
- with torch.no_grad():
74
- score = F.softmax(roberta_model(**inputs).logits, dim=-1)[0][1].item()
75
- return (score > 0.4), score, "RoBERTa"
76
-
77
- def get_disaster_type(text):
78
- text = text.lower()
79
- if any(x in text for x in ["flood", "baha", "water"]): return "Flood"
80
- if any(x in text for x in ["fire", "sunog"]): return "Fire"
81
- if any(x in text for x in ["quake", "lindol"]): return "Earthquake"
82
- return "General Emergency"
83
-
84
- def get_assistance_type(text):
85
- text = text.lower()
86
- if any(x in text for x in ["rescue", "roof", "trapped"]): return "Rescue"
87
- if any(x in text for x in ["medical", "doctor"]): return "Medical"
88
- return "General Assistance"
89
-
90
- def assign_dynamic_urgency(text):
91
- text = text.lower()
92
- if any(x in text for x in ["trapped", "bleeding", "drowning"]): return "High"
93
- if "stranded" in text: return "Medium"
94
- return "Low"
95
-
96
- def extract_entities_wrapper(text):
97
- res = extract_entities(text)
98
- loc = res.get('locations', ["Unknown"])[0] if res.get('locations') else "Unknown Location"
99
- return loc, res.get('contact')
100
 
101
- # 4. Processing Logic
102
  async def process_post(post):
 
103
  try:
104
  full_text = f"{post.title} {post.selftext}"
105
 
 
 
106
  with app.app_context():
107
  exists = DisasterPost.query.filter_by(reddit_id=post.id).first()
108
  if exists: return
109
-
 
 
 
 
 
 
 
 
 
110
  is_bad, reason = is_news_or_irrelevant(full_text)
111
- if is_bad: return
 
 
 
 
 
112
 
 
 
113
  is_urgent, score, source = predict_urgency(full_text)
114
- if not is_urgent: return
 
 
 
 
 
115
 
116
- location, contact = extract_entities_wrapper(full_text)
 
 
 
 
 
 
 
 
 
117
  disaster_type = get_disaster_type(full_text)
 
 
 
 
118
  dynamic_urgency = assign_dynamic_urgency(full_text)
119
 
120
- log(f"🚨 ALERT SAVED: {disaster_type} in {location} ({dynamic_urgency})")
 
 
 
 
 
 
121
 
 
 
122
  new_post = DisasterPost(
123
  reddit_id=post.id,
124
  title=post.title,
125
  content=post.selftext or post.title,
126
- author=str(post.author),
127
  location=location,
128
- contact_number=contact,
129
  disaster_type=disaster_type,
130
- assistance_type=get_assistance_type(full_text),
131
  urgency_level=dynamic_urgency,
132
  is_help_request=True,
133
- status='New',
134
  timestamp=datetime.utcfromtimestamp(post.created_utc)
135
  )
136
 
@@ -139,60 +179,265 @@ async def process_post(post):
139
  db.session.commit()
140
 
141
  except Exception as e:
142
- log(f"Processing Error: {e}")
143
 
144
- # 5. Main Loop (POLLING MODE)
145
- # 5. Main Loop (DEBUG MODE)
146
- async def scrape_reddit():
147
- log("🔌 Connecting to Reddit API (Debug Mode)...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  client_id = os.getenv("REDDIT_CLIENT_ID")
150
  client_secret = os.getenv("REDDIT_CLIENT_SECRET")
151
-
152
- # 1. Initialize Reddit
 
 
 
 
153
  reddit = asyncpraw.Reddit(
154
- client_id=client_id,
155
- client_secret=client_secret,
156
  user_agent=os.getenv("REDDIT_USER_AGENT"),
157
  username=os.getenv("REDDIT_USERNAME"),
158
  password=os.getenv("REDDIT_PASSWORD")
159
- )
160
-
161
- # --- THE TRUTH CHECK (Fixed) ---
162
- log(f"🧐 DEBUG CHECK: Am I logged in as {os.getenv('REDDIT_USERNAME')}?")
163
-
164
- # NO 'await' here. Just read the value directly.
165
- is_read_only = reddit.read_only
166
-
167
- log(f"👉 IS READ ONLY MODE? {is_read_only}")
168
- # True = Password Failed (You are Anonymous -> BLOCKED)
169
- # False = Password Worked (You are Logged In)
170
- # -----------------------
171
 
172
- log(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS}...")
173
-
174
- last_id = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- while True:
177
- try:
178
- subreddit = await reddit.subreddit(SUBREDDITS)
179
- async for post in subreddit.new(limit=1):
180
- if post.id != last_id:
181
- log(f"📥 New Post: {post.title}")
182
- await process_post(post)
183
- last_id = post.id
184
- await asyncio.sleep(60)
185
 
186
- except Exception as e:
187
- log(f"⚠️ Connection glitch: {e}")
188
- await asyncio.sleep(120)
189
 
190
- await reddit.close()
191
 
 
192
  if __name__ == "__main__":
193
  try:
194
  loop = asyncio.new_event_loop()
195
  asyncio.set_event_loop(loop)
196
  loop.run_until_complete(scrape_reddit())
197
  except KeyboardInterrupt:
198
- log("\n🛑 Stopped by user")
 
3
  import os
4
  import torch
5
  import pickle
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
  from datetime import datetime
9
  from dotenv import load_dotenv
10
  from flask import Flask
 
13
  from ner_extractor import extract_entities
14
  from huggingface_hub import hf_hub_download
15
 
 
 
 
 
 
 
16
# 1. Config & Setup
# defines the subreddits to be monitored by the scraper
SUBREDDITS = "AlistoSimulation"
# SUBREDDITS = "Philippines+NaturalDisasters+DisasterUpdatePH+Assistance+Typhoon+AlistoSimulation"

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# loads environment variables from .env, trying two candidate locations
env_path_1 = os.path.join(BASE_DIR, '../.env')  # Inside alisto_project
env_path_2 = os.path.join(BASE_DIR, '../../.env')  # In the main root

if os.path.exists(env_path_1):
    load_dotenv(env_path_1)
    print("✅ Loaded .env from alisto_project folder")
elif os.path.exists(env_path_2):
    load_dotenv(env_path_2)
    print("✅ Loaded .env from Root folder")
else:
    print("⚠️ WARNING: No .env file found! Passwords will be missing.")

# initializes the Flask application context for database access
app = Flask(__name__)
DB_PATH = os.path.join(BASE_DIR, 'alisto.db')
app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# sets a timeout for stable database connection
app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
db.init_app(app)

# 2. Load Models
print("Loading ALISTO Brains from Cloud...")

MODEL_ID = "Quivara/alisto-brain"

# A. RoBERTa (The Context Expert) — lives under the "roberta_model"
# subfolder of the Hugging Face repo, hence the subfolder argument.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, subfolder="roberta_model")
    roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, subfolder="roberta_model", num_labels=2)
    device = torch.device("cpu")
    roberta_model.to(device)
    roberta_model.eval()
    print(f"✅ Context Expert loaded from {MODEL_ID} (roberta_model folder)")
except Exception as e:
    print(f"❌ Error loading Model: {e}")
    # The RoBERTa model is mandatory: without it predict_urgency cannot run.
    # Abort explicitly (SystemExit) rather than calling the bare exit()
    # builtin, which is only guaranteed to exist via the interactive
    # `site` module. The old comment claiming a "fallback" was misleading —
    # this path has always aborted the script.
    raise SystemExit(1)

# B. TF-IDF (The Gatekeeper) — optional fast pre-filter
try:
    print("Downloading Gatekeeper (TF-IDF)...")
    # TF-IDF pickle lives in the repo root, so no subfolder needed
    tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
    with open(tfidf_path, 'rb') as f:
        tfidf_model = pickle.load(f)
    print("✅ Gatekeeper (TF-IDF) loaded")
except Exception as e:
    # Non-fatal: predict_urgency falls back to RoBERTa-only when None.
    print(f"❌ Error loading TF-IDF: {e}")
    tfidf_model = None

# 3. Reference Lists
# list of Philippine locations used for basic geo-validation
PHILIPPINE_LOCATIONS = [
    "Philippines", "PH", "Luzon", "Visayas", "Mindanao", "Metro Manila", "NCR",
    "Manila", "Quezon City", "Makati", "Taguig", "Pasig", "Mandaluyong",
    "Marikina", "Las Pinas", "Las Piñas", "Muntinlupa", "Caloocan",
    "Paranaque", "Parañaque", "Valenzuela", "Pasay", "Malabon",
    "Navotas", "San Juan", "Pateros",
    "Cavite", "Naic", "Bacoor", "Imus", "Dasmarinas", "Dasmariñas",
    "General Trias", "Tagaytay", "Kawit", "Noveleta", "Rosario", "Tanza",
    "Silang", "Trece Martires", "Laguna", "Calamba", "Santa Rosa", "Binan",
    "Biñan", "San Pedro", "Cabuyao", "Los Banos", "Los Baños", "Rizal",
    "Antipolo", "Cainta", "Taytay", "San Mateo", "Binangonan", "Batangas",
    "Bulacan", "Pampanga", "Tarlac", "Cebu", "Iloilo", "Tacloban",
    "Davao", "Cagayan", "Bicol", "Albay", "Isabela"
]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ # function to process a single Reddit submission through all filters and save it
98
  async def process_post(post):
99
+ """handles logic for a single Reddit submission (filtering, AI, saving)"""
100
  try:
101
  full_text = f"{post.title} {post.selftext}"
102
 
103
+ # A. Check for Duplicates & Credibility (Unchanged logic)
104
+ # checks for existing post ID in the database
105
  with app.app_context():
106
  exists = DisasterPost.query.filter_by(reddit_id=post.id).first()
107
  if exists: return
108
+ # blocks posts from suspicious new/low-karma accounts
109
+ if not is_credible_user(post):
110
+ print(f"\n------------------- DEBUG REJECTION -------------------")
111
+ print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
112
+ print(f"REASON: Credibility Check (Account too new/Low Karma)")
113
+ print(f"---------------------------------------------------------\n")
114
+ return
115
+
116
+ # B. Logic Filter (First Defense) (Unchanged logic)
117
+ # runs simple keyword checks to filter news/financial/irrelevant content
118
  is_bad, reason = is_news_or_irrelevant(full_text)
119
+ if is_bad:
120
+ print(f"\n------------------- DEBUG REJECTION -------------------")
121
+ print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
122
+ print(f"REASON: Logic Filter (Common Sense Layer) Categorized as: {reason}")
123
+ print(f"---------------------------------------------------------\n")
124
+ return
125
 
126
+ # C. AI Analysis (Unchanged logic)
127
+ # runs the cascade AI check (TF-IDF then RoBERTa)
128
  is_urgent, score, source = predict_urgency(full_text)
129
+ if not is_urgent:
130
+ print(f"\n------------------- DEBUG REJECTION -------------------")
131
+ print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
132
+ print(f"REASON: AI Confidence too low Score: {score:.2%} (Source: {source})")
133
+ print(f"---------------------------------------------------------\n")
134
+ return
135
 
136
+ # D. Entity Extraction
137
+ # extracts location, contact number, and contact person name
138
+ ner_results = extract_entities(full_text)
139
+ locations = ner_results.get('locations', [])
140
+ contact_num = ner_results.get('contact', None)
141
+ contact_person_name = ner_results.get('contact_person_name', None)
142
+
143
+ # E. Final Triage and Data Preparation
144
+ # assigns location and determines disaster/assistance type
145
+ location = locations[0] if locations else "Unknown Location"
146
  disaster_type = get_disaster_type(full_text)
147
+ assistance_type = get_assistance_type(full_text)
148
+
149
+ # 1. Calculate Dynamic Urgency (NEW)
150
+ # assigns High, Medium, or Low urgency based on severity keywords
151
  dynamic_urgency = assign_dynamic_urgency(full_text)
152
 
153
+ # 2. Finalize Author (Fallback Logic)
154
+ # defaults to Reddit username if no contact name is explicitly extracted
155
+ reddit_username = str(post.author) if post.author else "Unknown"
156
+ final_author = contact_person_name if contact_person_name else reddit_username
157
+
158
+ # 3. Print Final Alert Confirmation
159
+ print(f"""------------------- ALERT SAVED -------------------\n🚨 ALERT ({score:.2%}): {disaster_type} in {location} Urgency: {dynamic_urgency} \n---------------------------------------------------------""")
160
 
161
+ # F. Single Database Creation and Commit
162
+ # creates and commits the final DisasterPost object to the database
163
  new_post = DisasterPost(
164
  reddit_id=post.id,
165
  title=post.title,
166
  content=post.selftext or post.title,
167
+ author=final_author,
168
  location=location,
169
+ contact_number=contact_num,
170
  disaster_type=disaster_type,
171
+ assistance_type=assistance_type,
172
  urgency_level=dynamic_urgency,
173
  is_help_request=True,
 
174
  timestamp=datetime.utcfromtimestamp(post.created_utc)
175
  )
176
 
 
179
  db.session.commit()
180
 
181
  except Exception as e:
182
+ print(f"Post Processing Error for {post.id}: {e}")
183
 
184
# validates if the extracted location is relevant to the Philippines
# NOTE(review): not called anywhere in the visible code — confirm whether
# process_post should be using it before removing.
def check_for_philippine_location(location_list):
    """Return True if any extracted location matches a known Philippine place."""
    if not location_list:
        return False
    known_places = [place.lower() for place in PHILIPPINE_LOCATIONS]
    for candidate in location_list:
        lowered = candidate.lower()
        # partial match in either direction ("Marikina City" vs "Marikina")
        if any(known in lowered or lowered in known for known in known_places):
            return True
    return False
194
+
195
# classifies the type of disaster based on keyword matching
def get_disaster_type(text):
    """Map free text to a disaster category via substring keyword search.

    NOTE(review): matching is plain substring, so very short keywords can
    false-positive (e.g. "ash" inside "flash") — confirm acceptable.
    """
    lowered = text.lower()
    # tuple order matters: earlier categories win when keywords overlap
    categories = (
        ("Earthquake", ("quake", "lindol", "shake", "aftershock")),
        ("Landslide", ("landslide", "guho", "mudslide", "natabunan")),
        ("Volcano", ("volcano", "lava", "ash", "magma", "taal", "mayon")),
        ("Fire", ("fire", "sunog", "burn", "smoke")),
        ("Typhoon", ("typhoon", "bagyo", "storm", "wind", "signal", "ulysses", "odette")),
        ("Flood", ("flood", "baha", "water", "river", "drown", "lubog", "taas ng tubig")),
    )
    for category, keywords in categories:
        if any(word in lowered for word in keywords):
            return category
    return "General Emergency"
211
+
212
# classifies the specific type of assistance needed (e.g., Medical, Rescue, Food)
def get_assistance_type(text):
    """determines the specific help needed using Nested Priority"""
    lowered = text.lower()

    def mentions(words):
        # substring search against the lowered post text
        return any(word in lowered for word in words)

    # --- 1. IMMEDIATE RESCUE (Life Threatening) ---
    if mentions((
        "rescue", "saklolo", "trapped", "stuck", "stranded",
        "bubong", "roof", "boat", "bangka", "drowning", "lunod",
        "di makalabas", "unable to leave",
    )):
        # critical medical conditions override a plain Rescue classification
        if mentions((
            "bleeding", "unconscious", "head injury", "head wound",
            "severely bleeding", "stroke", "heart attack", "trauma",
        )):
            return "Medical"
        return "Rescue"

    # --- 2. MEDICAL (Specific Needs/Ambulance) ---
    # standalone medical needs when no rescue keywords were found
    if mentions((
        "medical", "doctor", "gamot", "medicine", "insulin", "dialysis",
        "hospital", "oxygen", "pregnant", "labor", "manganganak", "ambulance",
        "first aid", "pills", "medication",
    )):
        return "Medical"

    # --- 3. EVACUATION (Shelter/Transport) ---
    if mentions((
        "evacuate", "evacuation", "shelter", "center", "likas", "tents",
        "matutuluyan", "alis", "transportation", "walang matutuluyan",
    )):
        return "Evacuation"

    # --- 4. FOOD & WATER (Logistics) ---
    if mentions((
        "food", "pagkain", "water", "tubig", "gutom", "hungry", "relief",
        "goods", "makakain", "inumin", "groceries", "supplies", "supply", "wala ng stock",
        "gatas", "milk", "formula", "baby supplies", "ubos na", "wala na", "stock", "stock ng",
    )):
        return "Food/Water"

    return "General Assistance"
264
+
265
+
266
# --- LOGIC FILTERS (The "Common Sense" Layer) ---
# runs simple logic checks to filter out news reports and non-urgent context
def is_news_or_irrelevant(text):
    """Return (True, reason) for news/financial/off-topic posts, else (False, None)."""
    lowered = text.lower()

    # 1. NEWS & REPORTS — journalistic markers and agency advisories
    if any(marker in lowered for marker in (
        "breaking:", "just in:", "news:", "update:", "report:",
        "casualties", "death toll", "according to", "reported that",
        "suspension", "declared", "signal no", "public advisory",
        "weather update", "volcano alert", "mmda", "pagasa",
    )):
        return True, "News/Report"

    # 2. MONEY / SELLING — blocked unless life-threatening keywords coexist
    asks_for_money = any(marker in lowered for marker in (
        "gcash", "paypal", "budget", "loan", "selling",
        "fundraising", "donate", "send funds",
    ))
    life_or_death = any(marker in lowered for marker in (
        "trapped", "lubog", "roof", "rescue", "drowning", "stuck",
    ))
    if asks_for_money and not life_or_death:
        return True, "Financial/Non-Urgent"

    # 3. IRRELEVANT CONTEXT — commentary / well-wishes, not requests for help
    if any(marker in lowered for marker in (
        "how can i help", "where to donate", "thoughts and prayers",
        "keep safe", "god bless", "praying for", "discussion:", "opinion:",
    )):
        return True, "Context/NotUrgent"

    return False, None
307
+
308
# runs the two-stage AI classification check (TF-IDF then RoBERTa)
def predict_urgency(text):
    """Return (is_urgent, confidence, source_model) for a post's text."""
    # 1. Gatekeeper (TF-IDF): cheap fast-path rejection of obvious junk.
    # tfidf_model may be None if its download failed at startup.
    if tfidf_model:
        gate_conf = tfidf_model.predict_proba([text])[0][1]
        # below the 20% gate → skip the heavy model entirely
        if gate_conf < 0.20:
            return False, gate_conf, "TF-IDF Reject"

    # 2. Context Expert (RoBERTa): slower, context-aware classification.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = roberta_model(**encoded).logits
    confidence = F.softmax(logits, dim=-1)[0][1].item()  # P('Rescue Request')

    # final acceptance threshold (40%) for the RoBERTa model
    return confidence > 0.4, confidence, "RoBERTa"
331
+
332
# assigns the final severity level (High, Medium, Low) based on severity keywords
def assign_dynamic_urgency(text):
    """Triage a post into High / Medium / Low urgency via keyword tiers."""
    lowered = text.lower()

    # 1. HIGH: immediate life-threatening event or critical medical need
    high_tier = (
        "bleeding", "unconscious", "severely injured", "severe injury", "life threatening",
        "insulin", "oxygen", "ambulance", "urgent medicine", "doctor", "hospital",
        "trap", "trapped", "bubong", "collapsed", "di mapigilan", "drowning",
        "lampas tao", "lubog", "delikado", "baha na", "mamatay",
    )
    if any(word in lowered for word in high_tier):
        return "High"

    # 2. MEDIUM: time-sensitive or logistically critical situations
    medium_tier = (
        "stranded", "running out", "evacuate", "kailangan agad", "lowbat",
        "paubos", "senior", "bedridden", "disabled", "gatas", "formula",
    )
    if any(word in lowered for word in medium_tier):
        return "Medium"

    # 3. LOW: passed the AI filter but lacks the severity indicators above
    return "Low"
358
+
359
# blocks posts from accounts created less than 2 days ago or with negative karma
def is_credible_user(post):
    """Heuristic spam filter: reject brand-new or heavily-downvoted authors."""
    try:
        redditor = post.author
        # deleted/suspended accounts have no author object
        if not redditor:
            return False

        # 1. account must be older than 2 days
        age = datetime.utcnow() - datetime.utcfromtimestamp(redditor.created_utc)
        if age.days < 2:
            print(f" ⚠️ Blocked: Account too new ({age.days} days)")
            return False

        # 2. combined karma must not be strongly negative
        karma = redditor.comment_karma + redditor.link_karma
        if karma < -5:
            print(f" ⚠️ Blocked: Negative Karma ({karma})")
            return False

        return True
    except Exception:
        # fail-open: if the Reddit API can't supply user info, let the post through
        return True
387
+
388
# 4. Main Scraper Loop
async def scrape_reddit():
    """Connect to Reddit, backfill recent posts, then stream new submissions.

    Runs forever once the stream starts; any uncaught error closes the
    client in the finally block.
    """
    print("Connecting to Reddit API...")

    client_id = os.getenv("REDDIT_CLIENT_ID")
    client_secret = os.getenv("REDDIT_CLIENT_SECRET")

    # bail out early if the API credentials never loaded from .env
    if not client_id or not client_secret:
        print("❌ Error: Client ID or Secret missing in .env")
        return

    # reuse the validated locals instead of re-reading the environment
    # (the original re-called os.getenv inside the constructor)
    reddit = asyncpraw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=os.getenv("REDDIT_USER_AGENT"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD")
    )

    try:
        subreddit = await reddit.subreddit(SUBREDDITS)
        print(f"👁️  ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")

        # --- PHASE 1: FETCH LATEST EXISTING POSTS (last 5) ---
        # (old comment said "last 500" but the limit has always been 5)
        print("🔍 Scanning last 5 posts for missed alerts...")
        async for post in subreddit.new(limit=5):
            await process_post(post)

        print("✅ Historical scan complete")

        # --- PHASE 2: START REAL-TIME STREAM (Forever Loop) ---
        print("📡 Starting real-time stream for new submissions...")
        # skip_existing=False re-yields recent posts; duplicates are
        # filtered by the reddit_id lookup inside process_post
        async for post in subreddit.stream.submissions(skip_existing=False):
            await process_post(post)

    except Exception as e:
        print(f"Global Scraper Error: {e}")
    finally:
        await reddit.close()
        print("Scraper stopped")
 
 
 
 
433
 
 
 
 
434
 
 
435
 
436
# executes the main scraping loop when the script is run directly
if __name__ == "__main__":
    try:
        # create a dedicated event loop and run the scraper to completion
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        event_loop.run_until_complete(scrape_reddit())
    except KeyboardInterrupt:
        print("\n🛑 Stopped by user")