# app.py
"""Streamlit app that scores text fluency with a KenLM n-gram language model.

The text entered by the user is split into sentences, each sentence is scored
by KenLM, and the per-sentence and overall log-probabilities and perplexities
are shown as metrics, a bar chart and expandable per-sentence details.
"""
from pathlib import Path

import streamlit as st
import kenlm
import pandas as pd
import plotly.express as px
import torch
from datasets import load_dataset
import nltk
from nltk.tokenize import sent_tokenize

# NOTE(review): Path, torch and load_dataset are not referenced in the code
# below; they are kept because other tooling may rely on them -- confirm
# before removing.

# Tokenizer data needed by ``sent_tokenize``.  NLTK 3.9+ loads the Punkt
# sentence tokenizer from "punkt_tab"; older releases load it from "punkt".
_NLTK_RESOURCES = ("punkt", "punkt_tab")

# Default location of the KenLM model file (placeholder; point it at a real
# ARPA or binary model before running the app).
MODEL_PATH = "path_to_your_model.arpa"


def _download_nltk_resources() -> None:
    """Request every NLTK data package that sentence splitting depends on."""
    for resource in _NLTK_RESOURCES:
        nltk.download(resource)


_download_nltk_resources()


class OptimizedTextProcessor:
    """Turn raw text into cleaned, lower-cased sentences ready for scoring."""

    def __init__(self):
        # True once this instance has requested the tokenizer data.
        self.nltk_downloaded = False

    def preprocess_text(self, text: str) -> list:
        """Split *text* into sentences and normalise each one.

        Args:
            text: Arbitrary user-supplied text.

        Returns:
            A list of strings, one per non-empty sentence.  Each sentence is
            lower-cased, its words are separated by single spaces, and
            whitespace-delimited words that contain no alphanumeric character
            (stand-alone punctuation, for example) are dropped.
        """
        if not self.nltk_downloaded:
            _download_nltk_resources()
            self.nltk_downloaded = True

        processed_sentences = []
        # Split on the original casing: Punkt uses capitalisation as one of
        # its cues for sentence boundaries, so lower-casing must come after.
        for sentence in sent_tokenize(text):
            cleaned = ' '.join(
                word
                for word in sentence.lower().split()
                if any(c.isalnum() for c in word)
            )
            if cleaned:
                processed_sentences.append(cleaned)
        return processed_sentences


class EnhancedKenLMModel:
    """KenLM model bundled with the text preprocessing it expects."""

    def __init__(self, model_path: str):
        """Load the KenLM model stored at *model_path*."""
        self.model = kenlm.Model(model_path)
        self.processor = OptimizedTextProcessor()

    def evaluate_text(self, text: str) -> dict:
        """Score every sentence of *text* and aggregate the results.

        Args:
            text: Arbitrary user-supplied text.

        Returns:
            A dict with three keys:

            * ``'sentence_scores'`` -- list of dicts, one per sentence, with
              keys ``'sentence'``, ``'score'`` (log10 probability reported by
              KenLM), ``'perplexity'`` and ``'words'`` (word count).
            * ``'average_log_prob'`` -- total log10 probability divided by
              the number of scored tokens, or 0.0 when nothing was scored.
            * ``'overall_perplexity'`` -- ``10 ** -average_log_prob``, or 0.0
              when nothing was scored.
        """
        results = []
        total_log_prob = 0.0
        total_tokens = 0

        for sentence in self.processor.preprocess_text(text):
            # With its default arguments KenLM conditions on <s> and also
            # predicts </s>, so the markers must not be added to the string.
            score = self.model.score(sentence)
            words = len(sentence.split())
            # The score covers every word plus the end-of-sentence token, so
            # that is the count perplexity has to be normalised by (this is
            # what kenlm.Model.perplexity does as well).
            tokens = words + 1
            results.append({
                'sentence': sentence.strip(),
                'score': score,
                'perplexity': 10.0 ** (-score / tokens),
                'words': words,
            })
            total_log_prob += score
            total_tokens += tokens

        if total_tokens > 0:
            average_log_prob = total_log_prob / total_tokens
            overall_perplexity = 10.0 ** (-average_log_prob)
        else:
            average_log_prob = 0.0
            overall_perplexity = 0.0

        return {
            'sentence_scores': results,
            'average_log_prob': average_log_prob,
            'overall_perplexity': overall_perplexity,
        }


# Streamlit UI
st.set_page_config(page_title="KenLM Text Analysis", layout="wide")
st.title("Advanced Text Analysis with KenLM")
st.markdown("""
### About this app
This application uses an optimized KenLM language model to analyze text quality and fluency.
It provides detailed metrics and visualizations to help understand the text structure.
""")


# Initialize model
@st.cache_resource
def load_model(model_path: str = MODEL_PATH):
    """Build the model wrapper once and let Streamlit cache it across reruns.

    Args:
        model_path: Path of the KenLM model file to load.

    Returns:
        An ``EnhancedKenLMModel`` wrapping the loaded model.
    """
    return EnhancedKenLMModel(model_path)


# The whole UI runs inside one try block so that any failure (missing model
# file, missing tokenizer data, ...) is reported in the page instead of as a
# raw traceback.
try:
    model = load_model()

    # Text input
    text_input = st.text_area(
        "Enter your text for analysis",
        height=150,
        placeholder="Enter the text you want to analyze..."
    )

    if text_input:
        with st.spinner("Analyzing text..."):
            results = model.evaluate_text(text_input)

        # Display overall metrics
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Average Log Probability", f"{results['average_log_prob']:.4f}")
        with col2:
            st.metric("Overall Perplexity", f"{results['overall_perplexity']:.4f}")

        # Create visualization
        if results['sentence_scores']:
            df = pd.DataFrame(results['sentence_scores'])

            # Perplexity plot, one bar per sentence in input order
            fig_perplexity = px.bar(
                df,
                x=df.index,
                y='perplexity',
                title="Sentence Perplexity Scores"
            )
            st.plotly_chart(fig_perplexity, use_container_width=True)

            # Detailed sentence analysis
            st.subheader("Detailed Sentence Analysis")
            for idx, score in enumerate(results['sentence_scores']):
                with st.expander(f"Sentence {idx + 1}"):
                    st.write(f"Text: {score['sentence']}")
                    st.write(f"Score: {score['score']:.4f}")
                    st.write(f"Perplexity: {score['perplexity']:.4f}")
                    st.write(f"Word count: {score['words']}")
except Exception as e:
    st.error(f"An error occurred: {str(e)}")