Quivara committed
Commit 64f8d28 · verified · 1 parent: 42deb05

Update alisto_project/backend/ingest_reddit.py

alisto_project/backend/ingest_reddit.py CHANGED
@@ -3,8 +3,7 @@ import asyncio
  import os
  import torch
  import pickle
- import numpy as np
- import torch.nn.functional as F
+ import sys
  from datetime import datetime
  from dotenv import load_dotenv
  from flask import Flask
@@ -13,164 +12,125 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from ner_extractor import extract_entities
  from huggingface_hub import hf_hub_download

+ # Force prints to appear immediately in Hugging Face logs
+ def log(msg):
+     print(msg, flush=True)
+
+ log("🚀 INGEST SCRIPT LAUNCHED! Initializing...")
+
  # 1. Config & Setup
- # defines the subreddits to be monitored by the scraper
  SUBREDDITS = "AlistoSimulation"
- # SUBREDDITS = "Philippines+NaturalDisasters+DisasterUpdatePH+Assistance+Typhoon+AlistoSimulation"
-
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
- # loads environment variables from .env file
- env_path_1 = os.path.join(BASE_DIR, '../.env') # Inside alisto_project
- env_path_2 = os.path.join(BASE_DIR, '../../.env') # In the main root

+ # Load .env (Try multiple locations)
+ env_path_1 = os.path.join(BASE_DIR, '../.env')
  if os.path.exists(env_path_1):
      load_dotenv(env_path_1)
-     print("✅ Loaded .env from alisto_project folder")
- elif os.path.exists(env_path_2):
-     load_dotenv(env_path_2)
-     print("✅ Loaded .env from Root folder")
+     log("✅ Loaded .env from alisto_project folder")
  else:
-     print("⚠️ WARNING: No .env file found! Passwords will be missing.")
+     log("⚠️ No .env file found in alisto_project folder")

- # initializes the Flask application context for database access
  app = Flask(__name__)
  DB_PATH = os.path.join(BASE_DIR, 'alisto.db')
  app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
- # sets a timeout for stable database connection
  app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
  db.init_app(app)

  # 2. Load Models
- print("Loading ALISTO Brains from Cloud...")
-
+ log("🧠 Loading ALISTO Brains from Cloud (This takes 1-2 mins)...")
  MODEL_ID = "Quivara/alisto-brain"

  try:
-     # Load Tokenizer (Add subfolder argument)
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, subfolder="roberta_model")
-
-     # Load Model (Add subfolder argument)
      roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, subfolder="roberta_model", num_labels=2)
-
      device = torch.device("cpu")
      roberta_model.to(device)
      roberta_model.eval()
-     print(f"✅ Context Expert loaded from {MODEL_ID} (roberta_model folder)")
-
+     log(f"✅ Context Expert loaded from {MODEL_ID}")
  except Exception as e:
-     print(f"❌ Error loading Model: {e}")
-     # Emergency Fallback to generic model so app doesn't crash
-     exit()
+     log(f"❌ Error loading Model: {e}")

- # B. TF-IDF (The Gatekeeper)
  try:
-     print("Downloading Gatekeeper (TF-IDF)...")
-     # TF-IDF is likely in the root, so no subfolder needed
+     log("📥 Downloading Gatekeeper (TF-IDF)...")
      tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
-
      with open(tfidf_path, 'rb') as f:
          tfidf_model = pickle.load(f)
-     print("✅ Gatekeeper (TF-IDF) loaded")
+     log("✅ Gatekeeper (TF-IDF) loaded")
  except Exception as e:
-     print(f"❌ Error loading TF-IDF: {e}")
+     log(f"❌ Error loading TF-IDF (Ignore warnings): {e}")
      tfidf_model = None

- # 3. Reference Lists (Kept from your original)
- # list of Philippine locations used for basic geo-validation
- PHILIPPINE_LOCATIONS = [
-     "Philippines", "PH", "Luzon", "Visayas", "Mindanao", "Metro Manila", "NCR",
-     "Manila", "Quezon City", "Makati", "Taguig", "Pasig", "Mandaluyong",
-     "Marikina", "Las Pinas", "Las Piñas", "Muntinlupa", "Caloocan",
-     "Paranaque", "Parañaque", "Valenzuela", "Pasay", "Malabon",
-     "Navotas", "San Juan", "Pateros",
-     "Cavite", "Naic", "Bacoor", "Imus", "Dasmarinas", "Dasmariñas",
-     "General Trias", "Tagaytay", "Kawit", "Noveleta", "Rosario", "Tanza",
-     "Silang", "Trece Martires", "Laguna", "Calamba", "Santa Rosa", "Binan",
-     "Biñan", "San Pedro", "Cabuyao", "Los Banos", "Los Baños", "Rizal",
-     "Antipolo", "Cainta", "Taytay", "San Mateo", "Binangonan", "Batangas",
-     "Bulacan", "Pampanga", "Tarlac", "Cebu", "Iloilo", "Tacloban",
-     "Davao", "Cagayan", "Bicol", "Albay", "Isabela"
- ]
+ # 3. Helpers
+ def is_news_or_irrelevant(text):
+     text = text.lower()
+     if any(x in text for x in ["breaking:", "news:", "update:", "selling", "donate"]): return True, "Irrelevant"
+     return False, None

- # function to process a single Reddit submission through all filters and save it
+ def predict_urgency(text):
+     if tfidf_model and tfidf_model.predict_proba([text])[0][1] < 0.20: return False, 0, "TF-IDF"
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
+     with torch.no_grad():
+         # torch.softmax here, since the torch.nn.functional import (F) is removed above
+         score = torch.softmax(roberta_model(**inputs).logits, dim=-1)[0][1].item()
+     return (score > 0.4), score, "RoBERTa"
+
+ def get_disaster_type(text):
+     text = text.lower()
+     if any(x in text for x in ["flood", "baha", "water"]): return "Flood"
+     if any(x in text for x in ["fire", "sunog"]): return "Fire"
+     if any(x in text for x in ["quake", "lindol"]): return "Earthquake"
+     return "General Emergency"
+
+ def get_assistance_type(text):
+     text = text.lower()
+     if any(x in text for x in ["rescue", "roof", "trapped"]): return "Rescue"
+     if any(x in text for x in ["medical", "doctor"]): return "Medical"
+     return "General Assistance"
+
+ def assign_dynamic_urgency(text):
+     text = text.lower()
+     if any(x in text for x in ["trapped", "bleeding", "drowning"]): return "High"
+     if "stranded" in text: return "Medium"
+     return "Low"
+
+ def extract_entities_wrapper(text):
+     res = extract_entities(text)
+     loc = res.get('locations', ["Unknown"])[0] if res.get('locations') else "Unknown Location"
+     return loc, res.get('contact')
+
+ # 4. Processing Logic
  async def process_post(post):
-     """handles logic for a single Reddit submission (filtering, AI, saving)"""
      try:
          full_text = f"{post.title} {post.selftext}"

-         # A. Check for Duplicates & Credibility (Unchanged logic)
-         # checks for existing post ID in the database
          with app.app_context():
              exists = DisasterPost.query.filter_by(reddit_id=post.id).first()
              if exists: return
-             # blocks posts from suspicious new/low-karma accounts
-             if not is_credible_user(post):
-                 print(f"\n------------------- DEBUG REJECTION -------------------")
-                 print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-                 print(f"REASON: Credibility Check (Account too new/Low Karma)")
-                 print(f"---------------------------------------------------------\n")
-                 return
-
-             # B. Logic Filter (First Defense) (Unchanged logic)
-             # runs simple keyword checks to filter news/financial/irrelevant content
+
              is_bad, reason = is_news_or_irrelevant(full_text)
-             if is_bad:
-                 print(f"\n------------------- DEBUG REJECTION -------------------")
-                 print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-                 print(f"REASON: Logic Filter (Common Sense Layer) Categorized as: {reason}")
-                 print(f"---------------------------------------------------------\n")
-                 return
+             if is_bad: return

-             # C. AI Analysis (Unchanged logic)
-             # runs the cascade AI check (TF-IDF then RoBERTa)
              is_urgent, score, source = predict_urgency(full_text)
-             if not is_urgent:
-                 print(f"\n------------------- DEBUG REJECTION -------------------")
-                 print(f"❌ REJECTED POST ID: {post.id} (Title: {post.title[:30]})")
-                 print(f"REASON: AI Confidence too low Score: {score:.2%} (Source: {source})")
-                 print(f"---------------------------------------------------------\n")
-                 return
+             if not is_urgent: return

-             # D. Entity Extraction
-             # extracts location, contact number, and contact person name
-             ner_results = extract_entities(full_text)
-             locations = ner_results.get('locations', [])
-             contact_num = ner_results.get('contact', None)
-             contact_person_name = ner_results.get('contact_person_name', None)
-
-             # E. Final Triage and Data Preparation
-             # assigns location and determines disaster/assistance type
-             location = locations[0] if locations else "Unknown Location"
+             location, contact = extract_entities_wrapper(full_text)
              disaster_type = get_disaster_type(full_text)
-             assistance_type = get_assistance_type(full_text)
-
-             # 1. Calculate Dynamic Urgency (NEW)
-             # assigns High, Medium, or Low urgency based on severity keywords
              dynamic_urgency = assign_dynamic_urgency(full_text)

-             # 2. Finalize Author (Fallback Logic)
-             # defaults to Reddit username if no contact name is explicitly extracted
-             reddit_username = str(post.author) if post.author else "Unknown"
-             final_author = contact_person_name if contact_person_name else reddit_username
-
-             # 3. Print Final Alert Confirmation
-             print(f"""------------------- ALERT SAVED -------------------\n🚨 ALERT ({score:.2%}): {disaster_type} in {location} Urgency: {dynamic_urgency} \n---------------------------------------------------------""")
+             log(f"🚨 ALERT SAVED: {disaster_type} in {location} ({dynamic_urgency})")

-             # F. Single Database Creation and Commit
-             # creates and commits the final DisasterPost object to the database
              new_post = DisasterPost(
                  reddit_id=post.id,
                  title=post.title,
                  content=post.selftext or post.title,
-                 author=final_author,
+                 author=str(post.author),
                  location=location,
-                 contact_number=contact_num,
+                 contact_number=contact,
                  disaster_type=disaster_type,
-                 assistance_type=assistance_type,
+                 assistance_type=get_assistance_type(full_text),
                  urgency_level=dynamic_urgency,
                  is_help_request=True,
+                 status='New',
                  timestamp=datetime.utcfromtimestamp(post.created_utc)
              )

@@ -179,224 +139,19 @@ async def process_post(post):
              db.session.commit()

      except Exception as e:
-         print(f"Post Processing Error for {post.id}: {e}")
-
- # validates if the extracted location is relevant to the Philippines
- def check_for_philippine_location(location_list):
-     if not location_list: return False
-     ph_locations = [loc.lower() for loc in PHILIPPINE_LOCATIONS]
-     for extracted_loc in location_list:
-         # Check partial match (e.g., "Marikina City" matches "Marikina")
-         for known_loc in ph_locations:
-             if known_loc in extracted_loc.lower() or extracted_loc.lower() in known_loc:
-                 return True
-     return False
-
- # classifies the type of disaster based on severity keywords
- def get_disaster_type(text):
-     text_lower = text.lower()
-     mapping = {
-         "Earthquake": ["quake", "lindol", "shake", "aftershock"],
-         "Landslide": ["landslide", "guho", "mudslide", "natabunan"],
-         "Volcano": ["volcano", "lava", "ash", "magma", "taal", "mayon"],
-         "Fire": ["fire", "sunog", "burn", "smoke"],
-         "Typhoon": ["typhoon", "bagyo", "storm", "wind", "signal", "ulysses", "odette"],
-         "Flood": ["flood", "baha", "water", "river", "drown", "lubog", "taas ng tubig"]
-     }
-
-     for dtype, keywords in mapping.items():
-         if any(k in text_lower for k in keywords):
-             return dtype
-     return "General Emergency"
-
- # classifies the specific type of assistance needed (e.g., Medical, Rescue, Food)
- def get_assistance_type(text):
-     """determines the specific help needed using Nested Priority"""
-     text = text.lower()
-
-     # --- 1. IMMEDIATE RESCUE (Life Threatening) ---
-     rescue_kw = [
-         "rescue", "saklolo", "trapped", "stuck", "stranded",
-         "bubong", "roof", "boat", "bangka", "drowning", "lunod",
-         "di makalabas", "unable to leave"
-     ]
-     if any(k in text for k in rescue_kw):
-         critical_medical_override_kw = [
-             "bleeding", "unconscious", "head injury", "head wound",
-             "severely bleeding", "stroke", "heart attack", "trauma"
-         ]
-         if any(k in text for k in critical_medical_override_kw):
-             return "Medical"
-         return "Rescue" # if no critical medical keywords found
-
-     # --- 2. MEDICAL (Specific Needs/Ambulance) ---
-     # handles standalone medical needs if no rescue keywords were found
-     medical_kw = [
-         "medical", "doctor", "gamot", "medicine", "insulin", "dialysis",
-         "hospital", "oxygen", "pregnant", "labor", "manganganak", "ambulance",
-         "first aid", "pills", "medication"
-     ]
-     if any(k in text for k in medical_kw):
-         return "Medical"
-
-     # --- 3. EVACUATION (Shelter/Transport) ---
-     # classifies the need for temporary shelter or transport
-     evac_kw = [
-         "evacuate", "evacuation", "shelter", "center", "likas", "tents",
-         "matutuluyan", "alis", "transportation", "walang matutuluyan"
-     ]
-     if any(k in text for k in evac_kw):
-         return "Evacuation"
-
-     # --- 4. FOOD & WATER (Logistics) ---
-     # classifies the need for essential supplies (food, water, formula)
-     food_kw = [
-         "food", "pagkain", "water", "tubig", "gutom", "hungry", "relief",
-         "goods", "makakain", "inumin", "groceries", "supplies", "supply", "wala ng stock",
-         "gatas", "milk", "formula", "baby supplies", "ubos na", "wala na", "stock", "stock ng"
-     ]
-     if any(k in text for k in food_kw):
-         return "Food/Water"
-
-     return "General Assistance"
-
- # --- LOGIC FILTERS (The "Common Sense" Layer) ---
- # runs simple logic checks to filter out news reports and non-urgent context
- def is_news_or_irrelevant(text):
-     text_lower = text.lower()
-
-     # 1. NEWS & REPORTS
-     news_indicators = [
-         "breaking:", "just in:", "news:", "update:", "report:",
-         "casualties", "death toll", "according to", "reported that",
-         "suspension", "declared", "signal no", "public advisory",
-         "weather update", "volcano alert", "mmda", "pagasa"
-     ]
-
-     # 2. MONEY / SELLING
-     financial_indicators = [
-         "gcash", "paypal", "budget", "loan", "selling",
-         "fundraising", "donate", "send funds"
-     ]
-
-     # 3. IRRELEVANT CONTEXT
-     irrelevant_contexts = [
-         "how can i help", "where to donate", "thoughts and prayers",
-         "keep safe", "god bless", "praying for", "discussion:", "opinion:"
-     ]
-
-     # Logic Checks
-     if any(ind in text_lower for ind in news_indicators):
-         return True, "News/Report"
-
-     # blocks financial requests unless life-threatening keywords are also present
-     has_financial = any(ind in text_lower for ind in financial_indicators)
-     is_life_death = any(k in text_lower for k in ["trapped", "lubog", "roof", "rescue", "drowning", "stuck"])
-     if has_financial and not is_life_death:
-         return True, "Financial/Non-Urgent"
-
-     # blocks posts containing non-urgent discussion or commentary
-     if any(ctx in text_lower for ctx in irrelevant_contexts):
-         return True, "Context/NotUrgent"
-
-     return False, None
-
- # runs the two-stage AI classification check (TF-IDF then RoBERTa)
- def predict_urgency(text):
-     # 1. Gatekeeper (TF-IDF)
-     # quickly rejects posts with extremely low urgency confidence (below 20%)
-     if tfidf_model:
-         tfidf_probs = tfidf_model.predict_proba([text])[0]
-         tfidf_conf = tfidf_probs[1]
-         # If the fast model is sure it's junk, skip the heavy lifting
-         if tfidf_conf < 0.20:
-             return False, tfidf_conf, "TF-IDF Reject"
-
-     # 2. Context Expert (RoBERTa)
-     # runs the slower, context-aware model for final classification
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
-     with torch.no_grad():
-         outputs = roberta_model(**inputs)
-         probs = F.softmax(outputs.logits, dim=-1)
-         roberta_conf = probs[0][1].item() # Probability of 'Rescue Request'
-
-     # final acceptance threshold (40%) for the RoBERTa model
-     return (roberta_conf > 0.4), roberta_conf, "RoBERTa"
-
- # assigns the final severity level (High, Medium, Low) based on severity keywords
- def assign_dynamic_urgency(text):
-     text_lower = text.lower()
-
-     # 1. HIGH URGENCY (Immediate Life-Threatening Event or Critical Medical Need)
-     high_keywords = [
-         "bleeding", "unconscious", "severely injured", "severe injury", "life threatening",
-         "insulin", "oxygen", "ambulance", "urgent medicine", "doctor", "hospital",
-         "trap", "trapped", "bubong", "collapsed", "di mapigilan", "drowning",
-         "lampas tao", "lubog", "delikado", "baha na", "mamatay"
-     ]
-     if any(k in text_lower for k in high_keywords):
-         return "High"
-
-     # 2. MEDIUM URGENCY (Time-Sensitive, Logistical Crisis)
-     medium_keywords = [
-         "stranded", "running out", "evacuate", "kailangan agad", "lowbat",
-         "paubos", "senior", "bedridden", "disabled", "gatas", "formula"
-     ]
-     if any(k in text_lower for k in medium_keywords):
-         return "Medium"
-
-     # 3. LOW URGENCY (General Supplies/Warning)
-     # posts that pass the AI but lack the above severity indicators fall here
-     return "Low"
-
- # blocks posts from accounts created less than 2 days ago or with negative karma
- def is_credible_user(post):
-     try:
-         author = post.author
-
-         # checks if author is deleted or unknown
-         if not author:
-             return False
-
-         # 1. Check Account Age (Must be older than 2 days)
-         created_time = datetime.utcfromtimestamp(author.created_utc)
-         account_age = datetime.utcnow() - created_time
-         if account_age.days < 2:
-             print(f"   ⚠️ Blocked: Account too new ({account_age.days} days)")
-             return False
-
-         # 2. Check Karma (Must not be negative)
-         total_karma = author.comment_karma + author.link_karma
-         if total_karma < -5:
-             print(f"   ⚠️ Blocked: Negative Karma ({total_karma})")
-             return False
-
-         return True
-
-     except Exception as e:
-         # allows posts to pass if Reddit API fails to get user info
-         return True
+         log(f"Processing Error: {e}")

- # 4. Main Scraper Loop (POLLING MODE - BYPASSES CLOUD BLOCK)
+ # 5. Main Loop (POLLING MODE)
  async def scrape_reddit():
-     print("Connecting to Reddit API (Polling Mode)...")
+     log("🔌 Connecting to Reddit API (Polling Mode)...")

-     # Load credentials
      client_id = os.getenv("REDDIT_CLIENT_ID")
      client_secret = os.getenv("REDDIT_CLIENT_SECRET")

      if not client_id or not client_secret:
-         print("❌ Error: Client ID or Secret missing in .env")
+         log("❌ CRITICAL ERROR: Client ID or Secret missing in .env")
          return

      reddit = asyncpraw.Reddit(
          client_id=client_id,
          client_secret=client_secret,
@@ -405,40 +160,36 @@ async def scrape_reddit():
          password=os.getenv("REDDIT_PASSWORD")
      )

-     print(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60 seconds...")
-
-     # Keep track of the last post we saw so we don't duplicate
-     last_processed_id = None
+     log(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60s...")
+     last_id = None

      while True:
          try:
              subreddit = await reddit.subreddit(SUBREDDITS)

-             # Fetch ONLY the single newest post
+             # Fetch ONLY 1 post to minimize bandwidth
              async for post in subreddit.new(limit=1):
-                 if post.id != last_processed_id:
-                     print(f"📥 New Post Detected: {post.title}")
+                 if post.id != last_id:
+                     log(f"📥 New Post Detected: {post.title}")
                      await process_post(post)
-                     last_processed_id = post.id
+                     last_id = post.id
                  else:
-                     print("   (No new posts, waiting...)")
+                     # Silence "no new post" messages to keep logs clean
+                     pass

-             # Disconnect and sleep for 60 seconds (This prevents the 403 Ban)
+             # Wait 60 seconds (The Fix)
              await asyncio.sleep(60)

          except Exception as e:
-             print(f"⚠️ Connection glitch: {e}")
-             print("   Waiting 2 minutes before retry...")
+             log(f"⚠️ Connection glitch (Retrying in 2m): {e}")
              await asyncio.sleep(120)

-     # Note: We technically never reach this, but good practice to close
      await reddit.close()

- # executes the main scraping loop when the script is run
  if __name__ == "__main__":
      try:
          loop = asyncio.new_event_loop()
          asyncio.set_event_loop(loop)
          loop.run_until_complete(scrape_reddit())
      except KeyboardInterrupt:
-         print("\n🛑 Stopped by user")
+         log("\n🛑 Stopped by user")
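The commit's central change is the log() wrapper: Hugging Face Space logs only surface output once stdout flushes, so every message is now printed with flush=True. The newly added import sys is not actually used anywhere in the file; a minimal sketch of one plausible use, assuming Python 3.7+ (the commit itself never calls sys.stdout.reconfigure), would be to line-buffer stdout once at startup instead of flushing at every call site:

    import sys

    # Hypothetical alternative to a log() wrapper: line-buffer stdout so
    # every print() flushes on newline. A sketch, not what the commit does.
    sys.stdout.reconfigure(line_buffering=True)

    print("🚀 INGEST SCRIPT LAUNCHED! Initializing...")  # now appears immediately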
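The rewritten predict_urgency() compresses the old two-stage cascade into a few lines: a cheap TF-IDF gate rejects anything scoring under 0.20, and only the survivors are handed to the RoBERTa classifier with its 0.40 acceptance threshold. A standalone sketch of the same pattern, assuming tfidf_model is a scikit-learn estimator exposing predict_proba() and that label index 1 means "urgent" (cascade_predict and its parameters are illustrative names, not part of the file):

    import torch

    def cascade_predict(text, tfidf_model, tokenizer, model, gate=0.20, accept=0.40):
        # Stage 1: cheap TF-IDF gate skips the heavy model for obvious junk.
        if tfidf_model is not None:
            gate_conf = tfidf_model.predict_proba([text])[0][1]
            if gate_conf < gate:
                return False, gate_conf, "TF-IDF"
        # Stage 2: the slower context model scores whatever survives the gate.
        inputs = tokenizer(text, return_tensors="pt", truncation=True,
                           padding=True, max_length=128)
        with torch.no_grad():
            conf = torch.softmax(model(**inputs).logits, dim=-1)[0][1].item()
        return conf > accept, conf, "RoBERTa"

torch.softmax is the drop-in replacement once the import torch.nn.functional as F line is removed, as this commit does, since F.softmax is then no longer available.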
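Polling one post per minute keeps the request volume low enough to avoid the 403 blocks the old comments mention, but limit=1 also means that if several posts land within one 60-second window, only the newest is processed. A sketch of a batch variant with a seen-ID set (poll_new, batch, and interval are illustrative names, not part of the commit):

    import asyncio

    async def poll_new(subreddit, process, interval=60, batch=10):
        # Track IDs already handled so a burst of submissions inside one
        # polling window is not reduced to just the newest post.
        seen = set()
        while True:
            async for post in subreddit.new(limit=batch):
                if post.id not in seen:
                    seen.add(post.id)
                    await process(post)
            await asyncio.sleep(interval)

Duplicate protection is still double-checked downstream, since process_post() skips any reddit_id that already exists in the database.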