Spaces:

egumasa
/

simple-text-analyzer

Building

App Files Files Community

simple-text-analyzer / debug_bigram_trigram.py

egumasa

emuTAALES

e7279e4 7 months ago

raw

history blame contribute delete

2.51 kB

	#!/usr/bin/env python3
	"""
	Debug script to test bigram and trigram processing
	"""

	import sys
	import os

	# Put the current working directory first on the import path so that the
	# `text_analyzer` and `web_app` packages imported below can be resolved.
	# NOTE(review): this uses the working directory, not this file's location,
	# so it presumably expects the script to be launched from the project root.
	sys.path.insert(0, os.getcwd())

	from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
	from web_app.config_manager import ConfigManager

	# Small fixed sample that is run through the analyzer below.
	test_text = "The cat sat on the mat. The dog ran quickly."

	# Name of the bigram reference list exercised by this script. It is used both
	# as the lookup key in the reference config and as the list name handed to
	# the analyzer, so it is defined once here.
	BIGRAM_LIST_NAME = 'COCA_spoken_bigram_frequency_token'

	# Create the analyzer under test.
	analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

	# Load the reference-list configuration and keep only the English section.
	config = ConfigManager.load_reference_config()
	english_config = config.get('english', {})

	# Step 1: list every enabled reference list, grouped by n-gram type.
	print("=== Available Reference Lists ===")
	for ngram_type, lists in english_config.items():
	    print(f"\n{ngram_type.upper()}:")
	    for list_name, list_config in lists.items():
	        # Entries without an explicit 'enabled' flag are treated as enabled.
	        if list_config.get('enabled', True):
	            print(f" - {list_name}")

	# Step 2: load one bigram reference list and inspect what comes back.
	print("\n=== Testing Bigram Reference Loading ===")
	bigram_config = english_config.get('bigrams', {}).get(BIGRAM_LIST_NAME, {})
	if not bigram_config:
	    # Everything below needs this config entry; say so explicitly instead of
	    # passing an empty config to the loader.
	    print(f"No config found for {BIGRAM_LIST_NAME}; skipping bigram tests.")
	else:
	    print(f"Config: {bigram_config}")

	    # Load the data
	    data = ConfigManager.load_reference_list_data(bigram_config)
	    print(f"Loaded data keys: {data.keys()}")

	    if 'bigram' in data:
	        # Presumably a pandas DataFrame (it is used via .shape, .columns and
	        # .head()) -- confirm against ConfigManager.
	        bigram_df = data['bigram']
	        print(f"Bigram DataFrame shape: {bigram_df.shape}")
	        print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
	        print("First 5 bigrams:")
	        print(bigram_df.head())

	    # Step 3: hand the list to the analyzer as a name -> data mapping.
	    print("\n=== Testing Analyzer with Bigram References ===")
	    # NOTE(review): this loads the list a second time instead of reusing
	    # `data`; reuse it if the loader is known to be side-effect free.
	    reference_lists = {
	        BIGRAM_LIST_NAME: ConfigManager.load_reference_list_data(bigram_config),
	    }

	    print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
	    # A distinct loop variable keeps `data` (loaded above) from being rebound.
	    for name, list_data in reference_lists.items():
	        print(f" {name}: {list(list_data.keys())}")

	    # Load into analyzer
	    analyzer.load_reference_lists(reference_lists)

	    # Analyze text
	    results = analyzer.analyze_text(
	        test_text,
	        list(reference_lists.keys()),
	        apply_log=False,
	    )

	    # Step 4: summarize the result structure and show a few bigram rows.
	    print("\n=== Analysis Results ===")
	    print(f"Summary keys: {list(results['summary'].keys())}")
	    print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
	    print(f"Bigram details count: {len(results.get('bigram_details', []))}")
	    print(f"Trigram details count: {len(results.get('trigram_details', []))}")

	    if results.get('bigram_details'):
	        print("\nFirst few bigram details:")
	        for detail in results['bigram_details'][:3]:
	            print(f" {detail}")