# ido/test_micro_topics.py
# Commit 27d04ef by Parthnuwal7: "Adding backend to HF spaces"
"""
Test script for new micro topic extraction v2.
Uses the testing_json.json file.
"""
import sys
import io
import json
from pathlib import Path
# Fix Windows terminal encoding: force UTF-8 output so Devanagari test strings
# print instead of raising UnicodeEncodeError on cp1252 consoles.
# reconfigure() (Python 3.7+) is safer than re-wrapping sys.stdout.buffer:
# it also works when stdout was already wrapped or replaced by a redirector
# that lacks a .buffer attribute.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
# Add backend to path so the `services` package resolves regardless of CWD
sys.path.insert(0, str(Path(__file__).parent))
from services.micro_topic_service import (
extract_micro_topics_v2,
process_events_batch,
get_aggregated_topics,
extract_hashtags,
process_english_text,
process_hinglish_text,
process_hindi_text
)
def load_test_data():
    """Load and parse testing_json.json; return None (with a message) if absent."""
    test_file = Path(__file__).parent.parent / "testing_json.json"
    if test_file.exists():
        with open(test_file, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    print(f"[ERROR] Test file not found: {test_file}")
    return None
def test_hashtag_extraction():
    """Exercise extract_hashtags on English-only, no-hashtag, and mixed-script inputs."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hashtag Extraction")
    print(banner)
    samples = (
        "#familyguy #shorts",
        "Cabin crew layover in India #recommended #travel #cabincrew #emirates #india #kerala #kochi",
        "No hashtags here",
        "#हिंदी #test",  # mixed Devanagari/Latin tags
    )
    for sample in samples:
        print(f"\nInput: {sample}")
        print(f"Hashtags: {extract_hashtags(sample)}")
def test_english_pipeline():
    """Run process_english_text (NER + noun extraction) over sample titles."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: English Pipeline (NER + Nouns)")
    print(banner)
    samples = (
        "this man wanted to break india | us sad over bangladeshi leader's death | by prashant dhawan",
        "india vs sri lanka | semi-final | highlights | dp world men's u-19 asia cup 2025",
        "drinking hot milk in india",
    )
    for sample in samples:
        entities, nouns = process_english_text(sample)
        print(f"\nInput: {sample}")
        print(f"NER: {entities}")
        print(f"Nouns: {nouns}")
def test_hinglish_pipeline():
    """Run process_hinglish_text (Hindi stripped, then English pipeline) on mixed text."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Hinglish Pipeline (Remove Hindi -> English Pipeline)")
    print(banner)
    samples = (
        "आखरी अमावस्या और विनाश ?|| swami yo",
        "भारत जोड़ो यात्रा | rahul gandhi | congress",
        "modi speech in hindi राम मंदिर #ayodhya",
    )
    for sample in samples:
        stripped, entities, nouns = process_hinglish_text(sample)
        print(f"\nInput: {sample}")
        print(f"text_v1: {stripped}")
        print(f"NER: {entities}")
        print(f"Nouns: {nouns}")
def test_full_pipeline():
    """Run the complete batch pipeline on testing_json.json and print per-event
    extractions plus aggregated topic statistics."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Full Pipeline on testing_json.json")
    print(banner)

    payload = load_test_data()
    if not payload:
        return  # file missing; load_test_data already reported the error
    events = payload.get("events", [])
    print(f"\nLoaded {len(events)} events")

    # Process all events in a single batch call
    processed = process_events_batch(events)

    # Per-event breakdown
    for idx, item in enumerate(processed, start=1):
        print(f"\n--- Event {idx} ---")
        print(f"Type: {item.get('type')}, Engagement: {item.get('engagement')}")
        print(f"Language: {item.get('language_type')}")
        print(f"Text: {item.get('text_clean', '')[:80]}...")
        print(f"Hashtags: {item.get('hashtags', [])}")
        print(f"NER: {item.get('ner', [])}")
        print(f"Nouns: {item.get('nouns', [])}")
        if item.get('text_v1'):
            print(f"text_v1: {item.get('text_v1')}")
        print(f"Micro Topics: {item.get('micro_topics', [])}")

    # Aggregated stats across the whole batch
    print("\n" + banner)
    print("AGGREGATED RESULTS")
    print(banner)
    summary = get_aggregated_topics(processed)
    print(f"\nStats: {summary['stats']}")
    print(f"\nTop Hashtags: {summary['top_hashtags'][:10]}")
    print(f"\nTop NER: {summary['top_ner'][:10]}")
    print(f"\nTop Nouns: {summary['top_nouns'][:10]}")
    print(f"\nTop Micro Topics: {summary['top_micro_topics'][:15]}")
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Micro Topic Extraction v2 - Test Suite")
    print(banner)

    # Check if the NLP models are available before running the tests.
    # Bare `except:` replaced with narrowed handlers so KeyboardInterrupt /
    # SystemExit are not swallowed during the model probe.
    print("\nChecking models...")
    try:
        import spacy
        spacy.load("en_core_web_md")  # probe only; loaded pipeline is discarded
        print("[OK] spaCy en_core_web_md loaded")
    except Exception:  # ImportError (no spacy) or OSError (model missing)
        print("[X] spaCy en_core_web_md not available")
        print(" Run: python -m spacy download en_core_web_md")
    try:
        import stanza  # noqa: F401 -- availability check only
        print("[OK] Stanza package available")
    except ImportError:
        print("[X] Stanza not installed")
        print(" Run: pip install stanza")

    # Run tests
    test_hashtag_extraction()
    test_english_pipeline()
    test_hinglish_pipeline()
    test_full_pipeline()
    print("\n" + banner)
    print("ALL TESTS COMPLETED")
    print(banner)