# Spaces: parthnuwal7/ido (Running) | File size: 4,865 Bytes | 27d04ef

"""
Test script for new micro topic extraction v2.
Uses the testing_json.json file.
"""

import sys
import io
import json
from pathlib import Path

# Fix Windows terminal encoding: force UTF-8 output so the Devanagari sample
# strings below can be printed; unencodable characters are replaced.
if hasattr(sys.stdout, "reconfigure"):
    # Re-encode the existing stream in place. This keeps its buffering mode
    # and avoids a second TextIOWrapper sharing the same underlying buffer.
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
elif hasattr(sys.stdout, "buffer"):
    # Fallback for text streams that expose a byte buffer but cannot be
    # reconfigured in place.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
# Otherwise stdout was replaced by a stream without a byte buffer; leave it as is.

# Add this script's directory to the import path so that the `services`
# package imported below can be resolved.
sys.path.insert(0, str(Path(__file__).parent))

from services.micro_topic_service import (
    extract_micro_topics_v2,
    process_events_batch,
    get_aggregated_topics,
    extract_hashtags,
    process_english_text,
    process_hinglish_text,
    process_hindi_text
)


def load_test_data():
    """Read and parse ``testing_json.json``.

    The file is looked up one directory above the folder containing this
    script.

    Returns:
        The parsed JSON content, or ``None`` (after printing an ``[ERROR]``
        line) when the file does not exist.
    """
    fixture_path = Path(__file__).parent.parent / "testing_json.json"

    if fixture_path.exists():
        with fixture_path.open('r', encoding='utf-8') as handle:
            return json.load(handle)

    print(f"[ERROR] Test file not found: {fixture_path}")
    return None


def test_hashtag_extraction():
    """Print what ``extract_hashtags`` returns for a few sample strings.

    This is a smoke run for manual inspection: nothing is asserted, the
    input and the extracted hashtags are only echoed to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print("TEST: Hashtag Extraction")
    print(banner)

    samples = (
        "#familyguy #shorts",
        "Cabin crew layover in India #recommended #travel #cabincrew #emirates #india #kerala #kochi",
        "No hashtags here",
        "#हिंदी #test",  # Mixed
    )

    for sample in samples:
        found = extract_hashtags(sample)
        print(f"\nInput: {sample}")
        print(f"Hashtags: {found}")


def test_english_pipeline():
    """Print the NER and noun output of ``process_english_text`` for sample titles.

    ``process_english_text`` is called once per sample and its result is
    unpacked as a ``(ner, nouns)`` pair; both parts are echoed to stdout for
    manual inspection. Nothing is asserted.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("TEST: English Pipeline (NER + Nouns)")
    print(rule)

    titles = (
        "this man wanted to break india | us sad over bangladeshi leader's death | by prashant dhawan",
        "india vs sri lanka | semi-final | highlights | dp world men's u-19 asia cup 2025",
        "drinking hot milk in india",
    )

    for title in titles:
        entities, noun_terms = process_english_text(title)
        print(f"\nInput: {title}")
        print(f"NER: {entities}")
        print(f"Nouns: {noun_terms}")


def test_hinglish_pipeline():
    """Print the output of ``process_hinglish_text`` for mixed Hindi/English samples.

    Each result is unpacked as a ``(text_v1, ner, nouns)`` triple and all
    three parts are echoed to stdout for manual inspection. Nothing is
    asserted.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("TEST: Hinglish Pipeline (Remove Hindi -> English Pipeline)")
    print(rule)

    samples = (
        "आखरी अमावस्या और विनाश ?|| swami yo",
        "भारत जोड़ो यात्रा | rahul gandhi | congress",
        "modi speech in hindi राम मंदिर #ayodhya",
    )

    for sample in samples:
        stripped_text, entities, noun_terms = process_hinglish_text(sample)
        print(f"\nInput: {sample}")
        print(f"text_v1: {stripped_text}")
        print(f"NER: {entities}")
        print(f"Nouns: {noun_terms}")


def test_full_pipeline():
    """Run the batch pipeline over the events in testing_json.json and print the results.

    Steps:
      1. Load the test data via ``load_test_data``; return silently when it
         yields nothing (missing file or empty content).
      2. Pass the list under the ``"events"`` key to ``process_events_batch``.
      3. Print the extracted fields of every processed event.
      4. Print the summary returned by ``get_aggregated_topics``.

    Nothing is asserted; the output is meant for manual inspection.
    """
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST: Full Pipeline on testing_json.json")
    print(divider)

    payload = load_test_data()
    if not payload:
        return

    events = payload.get("events", [])
    print(f"\nLoaded {len(events)} events")

    # Run the whole batch through the service in one call.
    processed = process_events_batch(events)

    # Per-event dump, numbered from 1.
    for number, item in enumerate(processed, start=1):
        print(f"\n--- Event {number} ---")
        print(f"Type: {item.get('type')}, Engagement: {item.get('engagement')}")
        print(f"Language: {item.get('language_type')}")
        # Only the first 80 characters of the cleaned text are shown.
        preview = item.get('text_clean', '')[:80]
        print(f"Text: {preview}...")
        print(f"Hashtags: {item.get('hashtags', [])}")
        print(f"NER: {item.get('ner', [])}")
        print(f"Nouns: {item.get('nouns', [])}")
        # text_v1 is printed only when present and non-empty.
        text_v1 = item.get('text_v1')
        if text_v1:
            print(f"text_v1: {text_v1}")
        print(f"Micro Topics: {item.get('micro_topics', [])}")

    # Summary across all processed events.
    print(f"\n{divider}")
    print("AGGREGATED RESULTS")
    print(divider)

    summary = get_aggregated_topics(processed)

    print(f"\nStats: {summary['stats']}")
    # (heading, key in the summary, number of leading entries to show)
    top_sections = (
        ("Top Hashtags", 'top_hashtags', 10),
        ("Top NER", 'top_ner', 10),
        ("Top Nouns", 'top_nouns', 10),
        ("Top Micro Topics", 'top_micro_topics', 15),
    )
    for heading, key, limit in top_sections:
        print(f"\n{heading}: {summary[key][:limit]}")


if __name__ == "__main__":
    # Script entry point: report which NLP models are usable, then run every
    # smoke test in sequence. The tests only print; nothing is asserted.
    print("="*60)
    print("Micro Topic Extraction v2 - Test Suite")
    print("="*60)

    # Check if models are available
    print("\nChecking models...")

    # The probes are best-effort: a failure is reported and the run continues.
    # `except Exception` (rather than a bare `except`) lets Ctrl+C and
    # SystemExit propagate, and the reason is printed to aid diagnosis.
    try:
        import spacy
        spacy.load("en_core_web_md")
    except Exception as exc:
        print("[X] spaCy en_core_web_md not available")
        print(f"    Reason: {type(exc).__name__}: {exc}")
        print("    Run: python -m spacy download en_core_web_md")
    else:
        print("[OK] spaCy en_core_web_md loaded")

    try:
        import stanza  # noqa: F401 -- imported only to check availability
    except Exception as exc:
        print("[X] Stanza not installed")
        print(f"    Reason: {type(exc).__name__}: {exc}")
        print("    Run: pip install stanza")
    else:
        print("[OK] Stanza package available")

    # Run tests
    test_hashtag_extraction()
    test_english_pipeline()
    test_hinglish_pipeline()
    test_full_pipeline()

    print("\n" + "="*60)
    print("ALL TESTS COMPLETED")
    print("="*60)