File size: 5,115 Bytes
11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 5a13d2c 11a0fc5 5a13d2c 11a0fc5 5a13d2c a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 5a13d2c 11a0fc5 5a13d2c 11a0fc5 a4612d4 5a13d2c a4612d4 11a0fc5 a4612d4 11a0fc5 5a13d2c 11a0fc5 5a13d2c 11a0fc5 a4612d4 11a0fc5 a4612d4 11a0fc5 a4612d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | """
backend/scraper.py
==================
Fetches live YouTube chat comments, runs sentiment + topic classification,
and pushes results to Redis.
Accepts optional CLI arguments so multiple instances can run in parallel:
python -m backend.scraper --video_id VIDEO_ID --redis_key chat_messages_a
Defaults fall back to config.py values.
"""
import argparse
import json
import logging
import time
from datetime import datetime
import pytchat
import redis
from backend.config import (
VIDEO_ID as DEFAULT_VIDEO_ID,
REDIS_HOST,
REDIS_PORT,
REDIS_DB,
)
from ml.sentiment_model import predict_sentiment
from ml.topic_model import predict_topic, VALID_TOPICS
from ml.action_type_model import predict_action_type, VALID_ACTION_TYPES
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger("scraper")
MAX_REDIS_MESSAGES = 40000
def _safe_sentiment(text: str) -> tuple[str, float]:
try:
return predict_sentiment(text)
except Exception as exc:
logger.error("predict_sentiment failed for %r: %s", text[:60], exc)
return "Neutral", 0.50
def _safe_topic(text: str) -> tuple[str, float]:
try:
topic, conf = predict_topic(text)
if topic not in VALID_TOPICS:
return "General", 0.50
return topic, conf
except Exception as exc:
logger.error("predict_topic failed for %r: %s", text[:60], exc)
return "General", 0.50
def _safe_action_type(text: str) -> tuple[str, float]:
try:
action_type, conf = predict_action_type(text)
if action_type not in VALID_ACTION_TYPES:
return "N/A", 0.50
return action_type, conf
except Exception as exc:
logger.error("predict_action_type failed for %r: %s", text[:60], exc)
return "N/A", 0.50
def run(video_id: str, redis_key: str) -> None:
r = redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
db=REDIS_DB,
decode_responses=True,
socket_connect_timeout=5,
)
try:
r.ping()
logger.info("Redis connected ✓")
except redis.ConnectionError as e:
logger.error("Cannot connect to Redis: %s", e)
raise SystemExit(1)
logger.info("Starting scraper — video=%s redis_key=%s", video_id, redis_key)
chat = pytchat.create(video_id=video_id)
if not chat.is_alive():
logger.error("Could not connect to live chat for %s. Is the stream live?", video_id)
return
logger.info("Live chat connected ✓ — press Ctrl+C to stop")
while chat.is_alive():
try:
for c in chat.get().sync_items():
# pytchat converts emoji to :name: codes — convert back to actual characters
import emoji as _emoji
raw_text = c.message.strip()
text = _emoji.emojize(raw_text, language="alias")
author = c.author.name
if not text:
continue
sentiment, s_conf = _safe_sentiment(text)
topic, t_conf = _safe_topic(text)
# Only classify action type for Question/Request topics
if topic in ("Question", "Request/Feedback"):
action_type, at_conf = _safe_action_type(text)
else:
action_type, at_conf = "N/A", 0.50
message_data = {
"author": author,
"text": text,
"sentiment": sentiment,
"confidence": round(s_conf, 3),
"topic": topic,
"topic_conf": round(t_conf, 3),
"action_type": action_type,
"action_type_conf": round(at_conf, 3),
"time": datetime.now().isoformat(),
}
pipe = r.pipeline()
pipe.rpush(redis_key, json.dumps(message_data))
pipe.ltrim(redis_key, -MAX_REDIS_MESSAGES, -1)
pipe.execute()
logger.info(
"[%s] %s | %s(%.2f) %s(%.2f) %s(%.2f) | %r",
message_data["time"][11:19],
author[:20],
sentiment, s_conf,
topic, t_conf,
action_type, at_conf,
text[:60],
)
except KeyboardInterrupt:
logger.info("Stopped by user.")
break
except Exception as exc:
logger.error("Unexpected error: %s", exc, exc_info=True)
time.sleep(1)
logger.info("Chat stream ended — key=%s", redis_key)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--video_id", default=DEFAULT_VIDEO_ID, help="YouTube video ID")
parser.add_argument("--redis_key", default="chat_messages", help="Redis list key to write to")
args = parser.parse_args()
run(video_id=args.video_id, redis_key=args.redis_key)
|