Spaces:
Running
Running
| """ | |
| Test script for new micro topic extraction v2. | |
| Uses the testing_json.json file. | |
| """ | |
| import sys | |
| import io | |
| import json | |
| from pathlib import Path | |
# Fix Windows terminal encoding: make stdout emit UTF-8 and substitute any
# character that cannot be encoded instead of raising.
if hasattr(sys.stdout, "reconfigure"):
    # Python 3.7+: change the existing stream in place, so no second text
    # wrapper ends up sharing (and possibly closing) the same buffer.
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
elif hasattr(sys.stdout, "buffer"):
    # Older interpreters: wrap the raw byte buffer in a new UTF-8 text layer.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
# A replaced stdout exposing neither attribute is left untouched rather than
# crashing the script at import time.

# Add backend to path: put this script's own directory first on sys.path
# (presumably so the `services` package imported below resolves).
sys.path.insert(0, str(Path(__file__).parent))
| from services.micro_topic_service import ( | |
| extract_micro_topics_v2, | |
| process_events_batch, | |
| get_aggregated_topics, | |
| extract_hashtags, | |
| process_english_text, | |
| process_hinglish_text, | |
| process_hindi_text | |
| ) | |
def load_test_data():
    """Read and parse ``testing_json.json``.

    The file is looked up in the directory one level above the folder that
    contains this script.

    Returns:
        The decoded JSON content, or ``None`` (after printing an error line)
        when the file does not exist.
    """
    json_path = Path(__file__).parent.parent / "testing_json.json"
    if json_path.exists():
        with open(json_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    print(f"[ERROR] Test file not found: {json_path}")
    return None
def test_hashtag_extraction():
    """Print what ``extract_hashtags`` returns for a few sample strings.

    This is a visual check only: results are printed, nothing is asserted.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hashtag Extraction")
    print(banner)

    samples = (
        "#familyguy #shorts",
        "Cabin crew layover in India #recommended #travel #cabincrew #emirates #india #kerala #kochi",
        "No hashtags here",
        "#हिंदी #test",  # Devanagari and Latin hashtags in one string
    )
    for sample in samples:
        found = extract_hashtags(sample)
        print(f"\nInput: {sample}")
        print(f"Hashtags: {found}")
def test_english_pipeline():
    """Print the two values ``process_english_text`` returns for sample titles.

    The returned pair is unpacked and shown as "NER" and "Nouns". Results are
    printed for inspection; nothing is asserted.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: English Pipeline (NER + Nouns)")
    print(banner)

    samples = (
        "this man wanted to break india | us sad over bangladeshi leader's death | by prashant dhawan",
        "india vs sri lanka | semi-final | highlights | dp world men's u-19 asia cup 2025",
        "drinking hot milk in india",
    )
    for sample in samples:
        entities, noun_terms = process_english_text(sample)
        print(f"\nInput: {sample}")
        print(f"NER: {entities}")
        print(f"Nouns: {noun_terms}")
def test_hinglish_pipeline():
    """Print the three values ``process_hinglish_text`` returns for mixed-script samples.

    The returned triple is unpacked and shown as "text_v1", "NER" and "Nouns".
    Results are printed for inspection; nothing is asserted.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hinglish Pipeline (Remove Hindi -> English Pipeline)")
    print(banner)

    samples = (
        "आखरी अमावस्या और विनाश ?|| swami yo",
        "भारत जोड़ो यात्रा | rahul gandhi | congress",
        "modi speech in hindi राम मंदिर #ayodhya",
    )
    for sample in samples:
        stripped_text, entities, noun_terms = process_hinglish_text(sample)
        print(f"\nInput: {sample}")
        print(f"text_v1: {stripped_text}")
        print(f"NER: {entities}")
        print(f"Nouns: {noun_terms}")
def test_full_pipeline():
    """Run the batch pipeline on the events from testing_json.json and print the results.

    Loads the test data, passes its "events" list to ``process_events_batch``,
    prints selected fields of every processed event, then prints the summary
    returned by ``get_aggregated_topics``. Returns early, printing nothing
    further, when the test data is missing or empty.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Full Pipeline on testing_json.json")
    print(banner)

    payload = load_test_data()
    if not payload:
        return

    raw_events = payload.get("events", [])
    print(f"\nLoaded {len(raw_events)} events")

    # Process all events in a single batch call
    results = process_events_batch(raw_events)

    # Per-event report, numbered from 1
    for number, item in enumerate(results, start=1):
        print(f"\n--- Event {number} ---")
        print(f"Type: {item.get('type')}, Engagement: {item.get('engagement')}")
        print(f"Language: {item.get('language_type')}")
        print(f"Text: {item.get('text_clean', '')[:80]}...")
        print(f"Hashtags: {item.get('hashtags', [])}")
        print(f"NER: {item.get('ner', [])}")
        print(f"Nouns: {item.get('nouns', [])}")
        # text_v1 is only shown when present and non-empty
        if item.get('text_v1'):
            print(f"text_v1: {item.get('text_v1')}")
        print(f"Micro Topics: {item.get('micro_topics', [])}")

    # Aggregated stats across all processed events
    print("\n" + banner)
    print("AGGREGATED RESULTS")
    print(banner)

    summary = get_aggregated_topics(results)
    print(f"\nStats: {summary['stats']}")
    print(f"\nTop Hashtags: {summary['top_hashtags'][:10]}")
    print(f"\nTop NER: {summary['top_ner'][:10]}")
    print(f"\nTop Nouns: {summary['top_nouns'][:10]}")
    print(f"\nTop Micro Topics: {summary['top_micro_topics'][:15]}")
if __name__ == "__main__":
    # Script entry point: report whether the optional NLP dependencies are
    # usable, then run every test function in order.
    print("="*60)
    print("Micro Topic Extraction v2 - Test Suite")
    print("="*60)

    # Check if models are available
    print("\nChecking models...")
    try:
        import spacy
        # Only the success or failure of the load matters; the pipeline object
        # itself is not used here.
        spacy.load("en_core_web_md")
    except Exception as exc:
        # Best-effort probe: any failure (package missing, model missing,
        # broken install) is reported and the run continues. Catching
        # Exception rather than using a bare except lets Ctrl+C and
        # SystemExit still stop the script.
        print("[X] spaCy en_core_web_md not available")
        print(" Run: python -m spacy download en_core_web_md")
        print(f" Reason: {type(exc).__name__}: {exc}")
    else:
        print("[OK] spaCy en_core_web_md loaded")

    try:
        import stanza
    except Exception as exc:
        # Same best-effort policy as the spaCy probe above.
        print("[X] Stanza not installed")
        print(" Run: pip install stanza")
        print(f" Reason: {type(exc).__name__}: {exc}")
    else:
        print("[OK] Stanza package available")

    # Run tests
    test_hashtag_extraction()
    test_english_pipeline()
    test_hinglish_pipeline()
    test_full_pipeline()

    print("\n" + "="*60)
    print("ALL TESTS COMPLETED")
    print("="*60)