Quivara committed on
Commit
cb1e7e4
·
verified ·
1 Parent(s): 0c063e9

Update alisto_project/backend/ingest_reddit.py

Browse files
alisto_project/backend/ingest_reddit.py CHANGED
@@ -11,6 +11,7 @@ from flask import Flask
11
  from models import db, DisasterPost
12
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
  from ner_extractor import extract_entities
 
14
 
15
  # 1. Config & Setup
16
  # defines the subreddits to be monitored by the scraper
@@ -31,28 +32,33 @@ app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
31
  db.init_app(app)
32
 
33
  # 2. Load Models
34
- print("Loading ALISTO Brains...")
35
- MODEL_DIR = os.path.join(BASE_DIR, 'models')
36
- ROBERTA_DIR = os.path.join(MODEL_DIR, 'roberta_model')
37
- TFIDF_PATH = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
38
 
39
- # A. RoBERTa (XLM-R Multilingual)
40
- # loads the RoBERTa tokenizer and sequence classification model (Context Expert)
41
  try:
42
- tokenizer = AutoTokenizer.from_pretrained(ROBERTA_DIR)
43
- roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_DIR)
44
- device = torch.device("cpu") # determines the device (CPU/GPU) for model execution
 
 
 
 
45
  roberta_model.to(device)
46
  roberta_model.eval()
47
- print("✅ Context Expert (XLM-R) loaded")
 
48
  except Exception as e:
49
- print(f"❌ Error loading RoBERTa: {e}")
50
  exit()
51
 
52
  # B. TF-IDF (The Gatekeeper)
53
- # loads the pre-trained TF-IDF vectorizer and ensemble model (Gatekeeper)
54
  try:
55
- with open(TFIDF_PATH, 'rb') as f:
 
 
 
 
56
  tfidf_model = pickle.load(f)
57
  print("✅ Gatekeeper (TF-IDF) loaded")
58
  except Exception as e:
 
11
  from models import db, DisasterPost
12
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
  from ner_extractor import extract_entities
14
+ from huggingface_hub import hf_hub_download
15
 
16
  # 1. Config & Setup
17
  # defines the subreddits to be monitored by the scraper
 
32
  db.init_app(app)
33
 
34
  # 2. Load Models
35
+ print("Loading ALISTO Brains from Cloud...")
36
+ # Point to your new Model Repository
37
+ MODEL_ID = "Quivara/alisto-brain"
 
38
 
 
 
39
  try:
40
+ # Load Tokenizer
41
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
42
+
43
+ # Load Model (Num labels must match your training, usually 2 for urgent/not urgent)
44
+ roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2)
45
+
46
+ device = torch.device("cpu")
47
  roberta_model.to(device)
48
  roberta_model.eval()
49
+ print(f"✅ Context Expert loaded from {MODEL_ID}")
50
+
51
  except Exception as e:
52
+ print(f"❌ Error loading Model: {e}")
53
  exit()
54
 
55
  # B. TF-IDF (The Gatekeeper)
 
56
  try:
57
+ print("Downloading Gatekeeper (TF-IDF)...")
58
+ # Downloads the file from your alisto-brain repo to a cache folder
59
+ tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
60
+
61
+ with open(tfidf_path, 'rb') as f:
62
  tfidf_model = pickle.load(f)
63
  print("✅ Gatekeeper (TF-IDF) loaded")
64
  except Exception as e: