Spaces:
Running
Running
Update alisto_project/backend/ingest_reddit.py
Browse files
alisto_project/backend/ingest_reddit.py
CHANGED
|
@@ -11,6 +11,7 @@ from flask import Flask
|
|
| 11 |
from models import db, DisasterPost
|
| 12 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 13 |
from ner_extractor import extract_entities
|
|
|
|
| 14 |
|
| 15 |
# 1. Config & Setup
|
| 16 |
# defines the subreddits to be monitored by the scraper
|
|
@@ -31,28 +32,33 @@ app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {'connect_args': {'timeout': 15}}
|
|
| 31 |
db.init_app(app)
|
| 32 |
|
| 33 |
# 2. Load Models
|
| 34 |
-
print("Loading ALISTO Brains...")
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
TFIDF_PATH = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
|
| 38 |
|
| 39 |
-
# A. RoBERTa (XLM-R Multilingual)
|
| 40 |
-
# loads the RoBERTa tokenizer and sequence classification model (Context Expert)
|
| 41 |
try:
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
roberta_model.to(device)
|
| 46 |
roberta_model.eval()
|
| 47 |
-
print("✅ Context Expert
|
|
|
|
| 48 |
except Exception as e:
|
| 49 |
-
print(f"❌ Error loading
|
| 50 |
exit()
|
| 51 |
|
| 52 |
# B. TF-IDF (The Gatekeeper)
|
| 53 |
-
# loads the pre-trained TF-IDF vectorizer and ensemble model (Gatekeeper)
|
| 54 |
try:
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
tfidf_model = pickle.load(f)
|
| 57 |
print("✅ Gatekeeper (TF-IDF) loaded")
|
| 58 |
except Exception as e:
|
|
|
|
| 11 |
from models import db, DisasterPost
|
| 12 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 13 |
from ner_extractor import extract_entities
|
| 14 |
+
from huggingface_hub import hf_hub_download
|
| 15 |
|
| 16 |
# 1. Config & Setup
|
| 17 |
# defines the subreddits to be monitored by the scraper
|
|
|
|
| 32 |
db.init_app(app)
|
| 33 |
|
| 34 |
# 2. Load Models
|
| 35 |
+
print("Loading ALISTO Brains from Cloud...")
|
| 36 |
+
# Point to your new Model Repository
|
| 37 |
+
MODEL_ID = "Quivara/alisto-brain"
|
|
|
|
| 38 |
|
|
|
|
|
|
|
| 39 |
try:
|
| 40 |
+
# Load Tokenizer
|
| 41 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 42 |
+
|
| 43 |
+
# Load Model (Num labels must match your training, usually 2 for urgent/not urgent)
|
| 44 |
+
roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2)
|
| 45 |
+
|
| 46 |
+
device = torch.device("cpu")
|
| 47 |
roberta_model.to(device)
|
| 48 |
roberta_model.eval()
|
| 49 |
+
print(f"✅ Context Expert loaded from {MODEL_ID}")
|
| 50 |
+
|
| 51 |
except Exception as e:
|
| 52 |
+
print(f"❌ Error loading Model: {e}")
|
| 53 |
exit()
|
| 54 |
|
| 55 |
# B. TF-IDF (The Gatekeeper)
|
|
|
|
| 56 |
try:
|
| 57 |
+
print("Downloading Gatekeeper (TF-IDF)...")
|
| 58 |
+
# Downloads the file from your alisto-brain repo to a cache folder
|
| 59 |
+
tfidf_path = hf_hub_download(repo_id=MODEL_ID, filename="tfidf_ensemble.pkl")
|
| 60 |
+
|
| 61 |
+
with open(tfidf_path, 'rb') as f:
|
| 62 |
tfidf_model = pickle.load(f)
|
| 63 |
print("✅ Gatekeeper (TF-IDF) loaded")
|
| 64 |
except Exception as e:
|