# ido/test_micro_topics.py
# Commit 27d04ef by Parthnuwal7: "Adding backend to HF spaces"
"""
Test script for new micro topic extraction v2.
Uses the testing_json.json file.
"""
import sys
import io
import json
from pathlib import Path
# Fix Windows terminal encoding: force UTF-8 output so Devanagari test strings
# print instead of raising UnicodeEncodeError on cp1252 consoles.
# reconfigure() (Python 3.7+) is safer than re-wrapping sys.stdout.buffer:
# it also works when stdout was already wrapped or replaced by a redirector
# that lacks a .buffer attribute.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
# Add backend to path so the `services` package resolves regardless of CWD
sys.path.insert(0, str(Path(__file__).parent))
from services.micro_topic_service import (
extract_micro_topics_v2,
process_events_batch,
get_aggregated_topics,
extract_hashtags,
process_english_text,
process_hinglish_text,
process_hindi_text
)
def load_test_data():
    """Load and parse testing_json.json; return None (with a message) if absent."""
    test_file = Path(__file__).parent.parent / "testing_json.json"
    if test_file.exists():
        with open(test_file, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    print(f"[ERROR] Test file not found: {test_file}")
    return None
def test_hashtag_extraction():
    """Exercise extract_hashtags on English-only, no-hashtag, and mixed-script inputs."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hashtag Extraction")
    print(banner)
    samples = (
        "#familyguy #shorts",
        "Cabin crew layover in India #recommended #travel #cabincrew #emirates #india #kerala #kochi",
        "No hashtags here",
        "#हिंदी #test",  # mixed Devanagari/Latin tags
    )
    for sample in samples:
        print(f"\nInput: {sample}")
        print(f"Hashtags: {extract_hashtags(sample)}")
def test_english_pipeline():
    """Run process_english_text (NER + noun extraction) over sample titles."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: English Pipeline (NER + Nouns)")
    print(banner)
    samples = (
        "this man wanted to break india | us sad over bangladeshi leader's death | by prashant dhawan",
        "india vs sri lanka | semi-final | highlights | dp world men's u-19 asia cup 2025",
        "drinking hot milk in india",
    )
    for sample in samples:
        entities, nouns = process_english_text(sample)
        print(f"\nInput: {sample}")
        print(f"NER: {entities}")
        print(f"Nouns: {nouns}")
def test_hinglish_pipeline():
    """Run process_hinglish_text (Hindi stripped, then English pipeline) on mixed text."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hinglish Pipeline (Remove Hindi -> English Pipeline)")
    print(banner)
    samples = (
        "आखरी अमावस्या और विनाश ?|| swami yo",
        "भारत जोड़ो यात्रा | rahul gandhi | congress",
        "modi speech in hindi राम मंदिर #ayodhya",
    )
    for sample in samples:
        stripped, entities, nouns = process_hinglish_text(sample)
        print(f"\nInput: {sample}")
        print(f"text_v1: {stripped}")
        print(f"NER: {entities}")
        print(f"Nouns: {nouns}")
def test_full_pipeline():
    """Run the complete batch pipeline on testing_json.json and print per-event
    extractions plus aggregated topic statistics."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Full Pipeline on testing_json.json")
    print(banner)

    payload = load_test_data()
    if not payload:
        return  # file missing; load_test_data already reported the error
    events = payload.get("events", [])
    print(f"\nLoaded {len(events)} events")

    # Process all events in a single batch call
    processed = process_events_batch(events)

    # Per-event breakdown
    for idx, item in enumerate(processed, start=1):
        print(f"\n--- Event {idx} ---")
        print(f"Type: {item.get('type')}, Engagement: {item.get('engagement')}")
        print(f"Language: {item.get('language_type')}")
        print(f"Text: {item.get('text_clean', '')[:80]}...")
        print(f"Hashtags: {item.get('hashtags', [])}")
        print(f"NER: {item.get('ner', [])}")
        print(f"Nouns: {item.get('nouns', [])}")
        if item.get('text_v1'):
            print(f"text_v1: {item.get('text_v1')}")
        print(f"Micro Topics: {item.get('micro_topics', [])}")

    # Aggregated stats across the whole batch
    print("\n" + banner)
    print("AGGREGATED RESULTS")
    print(banner)
    summary = get_aggregated_topics(processed)
    print(f"\nStats: {summary['stats']}")
    print(f"\nTop Hashtags: {summary['top_hashtags'][:10]}")
    print(f"\nTop NER: {summary['top_ner'][:10]}")
    print(f"\nTop Nouns: {summary['top_nouns'][:10]}")
    print(f"\nTop Micro Topics: {summary['top_micro_topics'][:15]}")
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Micro Topic Extraction v2 - Test Suite")
    print(banner)

    # Check if the NLP models are available before running the tests.
    # Bare `except:` replaced with narrowed handlers so KeyboardInterrupt /
    # SystemExit are not swallowed during the model probe.
    print("\nChecking models...")
    try:
        import spacy
        spacy.load("en_core_web_md")  # probe only; loaded pipeline is discarded
        print("[OK] spaCy en_core_web_md loaded")
    except Exception:  # ImportError (no spacy) or OSError (model missing)
        print("[X] spaCy en_core_web_md not available")
        print(" Run: python -m spacy download en_core_web_md")
    try:
        import stanza  # noqa: F401 -- availability check only
        print("[OK] Stanza package available")
    except ImportError:
        print("[X] Stanza not installed")
        print(" Run: pip install stanza")

    # Run tests
    test_hashtag_extraction()
    test_english_pipeline()
    test_hinglish_pipeline()
    test_full_pipeline()
    print("\n" + banner)
    print("ALL TESTS COMPLETED")
    print(banner)