| """ | |
| Micro Topic Extraction Service v2.2 | |
| Optimizations: | |
| - Batch processing with spaCy nlp.pipe() (5-10x faster) | |
| - Progress logging for large datasets | |
| Fixes: | |
| - Fix 1: Multi-word NER subsumption (drop unigram components) | |
| - Fix 2: Text sanity for NER (clean artifacts like "#emirates #") | |
| - Fix 3: Weak noun blacklist (drop generic nouns unless in NER) | |
| - Fix 4: Noun sanity (clean "city.#history" -> "city") | |
| Pipeline: | |
| - Only processes events with engagement="active" and type="watch" | |
| - Extracts hashtags -> stored in `hashtags` field | |
| - English: NER (en_core_web_md) -> `ner`, Nouns (POS) -> `nouns` | |
| - Hinglish: Remove Devanagari -> `text_v1`, then apply English pipeline | |
| - Hindi: Stanza NER + Noun extraction | |
| - Final: Aggregate, filter, and deduplicate -> `micro_topics` | |
| """ | |
import re
import unicodedata
from typing import List, Dict, Set, Optional, Tuple
from collections import Counter

# =========================================================
# STOPWORDS
# =========================================================
ENGLISH_STOPWORDS = {
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "dare",
    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
    "from", "as", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "just", "also", "now", "and",
    "but", "if", "or", "because", "until", "while", "this", "that", "these",
    "those", "what", "which", "who", "whom", "whose", "i", "you", "he",
    "she", "it", "we", "they", "me", "him", "her", "us", "them", "my",
    "your", "his", "its", "our", "their", "myself", "yourself", "himself",
    "herself", "itself", "ourselves", "themselves", "am", "about", "get",
    "got", "go", "going", "went", "come", "came", "make", "made", "take",
    "took", "see", "saw", "know", "knew", "think", "thought", "want",
    "like", "look", "use", "find", "give", "tell", "say", "said", "video",
    "watch", "watched", "new", "first", "last", "best", "top", "full",
    "part", "episode", "ep", "vs", "ft", "feat", "official", "exclusive",
    "shorts", "short", "movie", "clip", "scene", "trailer", "teaser"
}

HINDI_STOPWORDS = {
    "का", "के", "की", "है", "हैं", "था", "थे", "थी", "में", "से", "को",
    "पर", "ने", "और", "या", "एक", "यह", "वह", "इस", "उस", "जो", "तो",
    "भी", "कर", "हो", "ही", "अब", "जब", "तक", "बहुत", "कुछ", "सब",
    "कोई", "किसी", "अपने", "उनके", "इनके", "वाले", "वाली", "वाला"
}

WEAK_NOUNS = {
    "man", "men", "woman", "women", "person", "people", "guy", "guys",
    "leader", "leaders", "member", "members", "player", "players",
    "team", "teams", "group", "groups", "family", "families",
    "death", "life", "time", "day", "days", "night", "nights",
    "year", "years", "month", "months", "week", "weeks",
    "thing", "things", "stuff", "way", "ways",
    "world", "place", "places", "area", "areas", "country", "countries",
    "city", "cities", "town", "towns", "home", "house",
    "end", "start", "beginning", "part", "parts", "side", "sides",
    "point", "points", "case", "cases", "fact", "facts",
    "news", "update", "updates", "story", "stories",
    "channel", "channels", "subscriber", "subscribers",
    "view", "views", "like", "likes", "comment", "comments",
    "reaction", "reactions", "highlight", "highlights",
    "moment", "moments", "episode", "episodes"
}

# =========================================================
# LAZY MODEL LOADING
# =========================================================
_nlp_en_md = None
_stanza_hi = None


def get_spacy_english_md():
    """Lazy load spaCy English medium model."""
    global _nlp_en_md
    if _nlp_en_md is None:
        try:
            import spacy
            _nlp_en_md = spacy.load("en_core_web_md")
            print("[OK] Loaded en_core_web_md")
        except OSError:
            print("[ERROR] en_core_web_md not found. Run: python -m spacy download en_core_web_md")
            _nlp_en_md = None
    return _nlp_en_md


def get_stanza_hindi():
    """Lazy load Stanza Hindi pipeline."""
    global _stanza_hi
    if _stanza_hi is None:
        try:
            import stanza
            stanza.download('hi', verbose=False)
            _stanza_hi = stanza.Pipeline('hi', processors='tokenize,pos,ner', verbose=False)
            print("[OK] Loaded Stanza Hindi pipeline")
        except Exception as e:
            print(f"[ERROR] Stanza Hindi failed: {e}")
            _stanza_hi = None
    return _stanza_hi

# =========================================================
# TEXT CLEANING UTILITIES
# =========================================================
def normalize_unicode(text: str) -> str:
    """Unicode normalization (NFC) and zero-width character removal."""
    if not text:
        return ""
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\ufeff]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def remove_devanagari(text: str) -> str:
    """Remove Devanagari (Hindi) characters from text."""
    return re.sub(r'[\u0900-\u097F]+', ' ', text)


def remove_stopwords(text: str, stopwords: Set[str]) -> str:
    """Remove stopwords from text."""
    words = text.split()
    filtered = [w for w in words if w.lower() not in stopwords]
    return ' '.join(filtered)


def clean_text_v1(text: str) -> str:
    """Clean text for Hinglish processing."""
    text = remove_devanagari(text)
    text = normalize_unicode(text)
    text = remove_stopwords(text, ENGLISH_STOPWORDS)
    text = re.sub(r'[|]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def sanitize_topic(text: str) -> str:
    """
    FIX 2 & 4: Sanitize any topic text (NER, noun, etc).
    Handles cases like:
    - "#emirates #" -> "emirates"
    - "city.#history" -> "city"
    - "world—kuwait" -> "world kuwait"
    """
    if not text:
        return ""
    # Drop hashtags glued onto another token by a dot (e.g. "city.#history" -> "city")
    text = re.sub(r'\.#\w*', '', text)
    # Turn remaining '#' markers into spaces so the tag word survives ("#emirates #" -> "emirates")
    text = text.replace('#', ' ')
    # Replace em-dash and other dashes with space
    text = re.sub(r'[—–\-]+', ' ', text)
    # Remove leading/trailing punctuation and special chars
    text = text.strip(' #@|[](){}.,!?:;"\'-_')
    # Collapse spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# =========================================================
# HASHTAG EXTRACTION
# =========================================================
def extract_hashtags(text: str) -> List[str]:
    """Extract hashtags from text. Returns list of hashtag values (without #)."""
    if not text:
        return []
    hashtags = re.findall(r'#(\w+)', text, re.IGNORECASE)
    seen = set()
    result = []
    for tag in hashtags:
        tag_lower = tag.lower()
        if tag_lower not in seen and len(tag_lower) >= 2:
            result.append(tag_lower)
            seen.add(tag_lower)
    return result
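# Example (hypothetical input): extract_hashtags("#Emirates flight #A380 review")
# returns ["emirates", "a380"] — tags are lowercased, deduplicated in order of
# first appearance, and must be at least 2 characters long.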

# =========================================================
# BATCH PROCESSING FOR SPACY
# =========================================================
def process_texts_batch_english(texts: List[str], batch_size: int = 100) -> List[Tuple[List[str], List[str]]]:
    """
    Process multiple texts through spaCy using nlp.pipe() for efficiency.
    Returns list of (ner_list, noun_list) tuples, one per input text.
    5-10x faster than processing individually.
    """
    nlp = get_spacy_english_md()
    if nlp is None:
        return [([], []) for _ in texts]

    # Filter out None/empty texts and track indices
    valid_indices = []
    valid_texts = []
    for i, text in enumerate(texts):
        if text and isinstance(text, str) and text.strip():
            valid_indices.append(i)
            valid_texts.append(text)

    # Initialize results for all texts (including empty ones)
    results = [([], []) for _ in texts]
    if not valid_texts:
        return results

    target_labels = {"PERSON", "ORG", "GPE", "LOC", "EVENT", "NORP", "FAC", "PRODUCT", "WORK_OF_ART"}

    # Use nlp.pipe for batch processing on valid texts only
    for idx, doc in enumerate(nlp.pipe(valid_texts, batch_size=batch_size)):
        original_idx = valid_indices[idx]

        # Extract NER
        ner = []
        ner_seen = set()
        for ent in doc.ents:
            if ent.label_ in target_labels:
                ent_text = sanitize_topic(ent.text).lower()
                if ent_text and ent_text not in ner_seen and len(ent_text) >= 2:
                    ner.append(ent_text)
                    ner_seen.add(ent_text)

        # Create protected set from NER
        protected = set()
        for entity in ner:
            for word in entity.split():
                protected.add(word.lower())

        # Extract nouns
        nouns = []
        noun_seen = set()
        for token in doc:
            if token.pos_ in ("NOUN", "PROPN"):
                # FIX 4: Sanitize noun text
                noun_text = sanitize_topic(token.text).lower()
                if not noun_text or len(noun_text) < 2:
                    continue
                if noun_text in noun_seen:
                    continue
                if noun_text in ENGLISH_STOPWORDS:
                    continue
                if noun_text.isdigit():
                    continue
                if noun_text in WEAK_NOUNS and noun_text not in protected:
                    continue
                nouns.append(noun_text)
                noun_seen.add(noun_text)

        results[original_idx] = (ner, nouns)

    return results

# =========================================================
# HINDI PIPELINE (Stanza) - Not batched (used less frequently)
# =========================================================
def process_hindi_text(text: str) -> Tuple[List[str], List[str]]:
    """Process Hindi text through Stanza. Returns (ner_list, noun_list)."""
    if not text:
        return [], []
    stanza_pipeline = get_stanza_hindi()
    if stanza_pipeline is None:
        return [], []

    text = normalize_unicode(text)
    try:
        doc = stanza_pipeline(text)

        # NER
        ner = []
        ner_seen = set()
        for sentence in doc.sentences:
            for ent in sentence.ents:
                ent_text = sanitize_topic(ent.text)
                if ent_text and ent_text not in ner_seen and len(ent_text) >= 2:
                    ner.append(ent_text)
                    ner_seen.add(ent_text)

        # Nouns
        nouns = []
        noun_seen = set()
        for sentence in doc.sentences:
            for word in sentence.words:
                if word.upos in ("NOUN", "PROPN"):
                    noun_text = sanitize_topic(word.text)
                    if (noun_text and
                            noun_text not in noun_seen and
                            len(noun_text) >= 2 and
                            noun_text not in HINDI_STOPWORDS):
                        nouns.append(noun_text)
                        noun_seen.add(noun_text)

        return ner, nouns
    except Exception as e:
        print(f"[ERROR] Hindi processing failed: {e}")
        return [], []

# =========================================================
# NER SUBSUMPTION
# =========================================================
def apply_ner_subsumption(all_topics: List[str], ner_entities: List[str]) -> List[str]:
    """
    FIX 1: If a multi-word NER exists, drop its unigram components.
    Example: If "greg biffle" in topics -> drop "greg" and "biffle"
    """
    subsumed = set()
    for entity in ner_entities:
        words = entity.split()
        if len(words) > 1:
            for word in words:
                subsumed.add(word.lower())

    filtered = []
    for topic in all_topics:
        topic_lower = topic.lower()
        if ' ' in topic_lower or topic_lower not in subsumed:
            filtered.append(topic)
    return filtered
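# Example (hypothetical topics): apply_ner_subsumption(
#     ["nascar", "greg", "biffle", "greg biffle"], ["greg biffle"])
# returns ["nascar", "greg biffle"] — the unigram components are dropped,
# the multi-word entity itself is kept.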

# =========================================================
# MAIN BATCH EXTRACTION FUNCTION
# =========================================================
def process_events_batch(events: List[Dict], batch_size: int = 100) -> List[Dict]:
    """
    Process a batch of events with optimized batch NLP processing.
    Uses spaCy's nlp.pipe() for 5-10x faster processing.
    """
    # Filter qualifying events
    qualifying_indices = []
    texts_to_process = []
    language_types = []
    for i, event in enumerate(events):
        if event.get("type") == "watch" and event.get("engagement") == "active":
            text_clean = event.get("text_clean", "")
            lang = event.get("language_type", "").lower()
            if lang == "hinglish":
                # For hinglish, clean the text first
                text_v1 = clean_text_v1(text_clean)
                event["text_v1"] = text_v1
                texts_to_process.append(text_v1 if text_v1 else "")
            elif lang == "hindi":
                texts_to_process.append("")  # Will process separately
            else:
                texts_to_process.append(text_clean)
            qualifying_indices.append(i)
            language_types.append(lang)

    total = len(qualifying_indices)
    print(f"[TOPIC] Processing {total} qualifying events...")
    if total == 0:
        return events

    # Batch process English/Hinglish texts through spaCy
    english_indices = []
    english_texts = []
    for idx, lang in enumerate(language_types):
        if lang in ("english", "hinglish", "unknown", ""):
            english_indices.append(idx)
            english_texts.append(texts_to_process[idx])

    print(f"[TOPIC] Batch processing {len(english_texts)} English/Hinglish texts...")
    english_results = process_texts_batch_english(english_texts, batch_size)

    # Map results back
    english_result_map = {}
    for idx, result in zip(english_indices, english_results):
        english_result_map[idx] = result

    # Process each event and add results
    processed_count = 0
    for idx, i in enumerate(qualifying_indices):
        event = events[i]
        text_clean = event.get("text_clean", "")
        lang = language_types[idx]

        # Extract hashtags
        hashtags = extract_hashtags(text_clean)

        # Get NER and nouns based on language
        if lang == "hindi":
            ner, nouns = process_hindi_text(text_clean)
        else:
            # Get from batch results
            ner, nouns = english_result_map.get(idx, ([], []))

        # Aggregate topics
        all_topics = []
        seen = set()
        for h in hashtags:
            h_lower = h.lower()
            if h_lower not in seen:
                all_topics.append(h_lower)
                seen.add(h_lower)
        for e in ner:
            e_lower = e.lower()
            if e_lower not in seen:
                all_topics.append(e_lower)
                seen.add(e_lower)
        for n in nouns:
            n_lower = n.lower()
            if n_lower not in seen:
                all_topics.append(n_lower)
                seen.add(n_lower)

        # Apply NER subsumption
        all_topics = apply_ner_subsumption(all_topics, ner)

        # Store results
        event["hashtags"] = hashtags
        event["ner"] = ner
        event["nouns"] = nouns
        event["micro_topics"] = all_topics

        processed_count += 1
        if processed_count % 1000 == 0:
            print(f"[TOPIC] Processed {processed_count}/{total} events...")

    print(f"[TOPIC] Completed processing {total} events.")
    return events

# =========================================================
# LEGACY SINGLE EVENT FUNCTION (for testing)
# =========================================================
def extract_micro_topics_v2(event: Dict) -> Dict:
    """Extract micro topics from a single event. Use process_events_batch for bulk."""
    if event.get("type") != "watch" or event.get("engagement") != "active":
        return event

    text_clean = event.get("text_clean", "")
    language_type = event.get("language_type", "").lower()
    hashtags = extract_hashtags(text_clean)

    if language_type == "english":
        results = process_texts_batch_english([text_clean])
        ner, nouns = results[0] if results else ([], [])
    elif language_type == "hinglish":
        text_v1 = clean_text_v1(text_clean)
        event["text_v1"] = text_v1
        results = process_texts_batch_english([text_v1])
        ner, nouns = results[0] if results else ([], [])
    elif language_type == "hindi":
        ner, nouns = process_hindi_text(text_clean)
    else:
        results = process_texts_batch_english([text_clean])
        ner, nouns = results[0] if results else ([], [])

    all_topics = []
    seen = set()
    for h in hashtags:
        h_lower = h.lower()
        if h_lower not in seen:
            all_topics.append(h_lower)
            seen.add(h_lower)
    for e in ner:
        e_lower = e.lower()
        if e_lower not in seen:
            all_topics.append(e_lower)
            seen.add(e_lower)
    for n in nouns:
        n_lower = n.lower()
        if n_lower not in seen:
            all_topics.append(n_lower)
            seen.add(n_lower)

    all_topics = apply_ner_subsumption(all_topics, ner)

    event["hashtags"] = hashtags
    event["ner"] = ner
    event["nouns"] = nouns
    event["micro_topics"] = all_topics
    return event

def get_aggregated_topics(events: List[Dict], top_n: int = 50) -> Dict:
    """Aggregate micro topics across all events."""
    hashtag_counter = Counter()
    ner_counter = Counter()
    noun_counter = Counter()
    topic_counter = Counter()

    for event in events:
        hashtag_counter.update(event.get("hashtags", []))
        ner_counter.update(event.get("ner", []))
        noun_counter.update(event.get("nouns", []))
        topic_counter.update(event.get("micro_topics", []))

    return {
        "top_hashtags": [{"topic": t, "count": c} for t, c in hashtag_counter.most_common(top_n)],
        "top_ner": [{"topic": t, "count": c} for t, c in ner_counter.most_common(top_n)],
        "top_nouns": [{"topic": t, "count": c} for t, c in noun_counter.most_common(top_n)],
        "top_micro_topics": [{"topic": t, "count": c} for t, c in topic_counter.most_common(top_n)],
        "stats": {
            "total_unique_hashtags": len(hashtag_counter),
            "total_unique_ner": len(ner_counter),
            "total_unique_nouns": len(noun_counter),
            "total_unique_topics": len(topic_counter)
        }
    }
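
# =========================================================
# EXAMPLE USAGE (illustrative sketch)
# =========================================================
# A minimal demo of the batch API, assuming the event schema used above
# (type / engagement / language_type / text_clean). The sample events and
# their field values are hypothetical; a real run needs en_core_web_md
# installed (and the Stanza Hindi models for language_type == "hindi").
if __name__ == "__main__":
    sample_events = [
        {
            "type": "watch",
            "engagement": "active",
            "language_type": "english",
            "text_clean": "Greg Biffle wins at Daytona #nascar #racing highlights",
        },
        {
            "type": "watch",
            "engagement": "active",
            "language_type": "hinglish",
            "text_clean": "Virat Kohli का शानदार शतक #cricket India vs Australia",
        },
    ]
    processed = process_events_batch(sample_events, batch_size=16)
    for ev in processed:
        print(ev.get("micro_topics", []))
    print(get_aggregated_topics(processed, top_n=10))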