kenlmApp / app.py
V8055's picture
Update app.py
4ede509 verified
# app.py
import streamlit as st
import kenlm
import pandas as pd
import plotly.express as px
from pathlib import Path
import torch
from datasets import load_dataset
import nltk
from nltk.tokenize import sent_tokenize
# Fetch the Punkt sentence-tokenizer data that sent_tokenize needs.
# Older NLTK releases load the pickled "punkt" package, while releases from
# 3.8.2 onwards load "punkt_tab" instead, so both are requested to keep the
# app working regardless of the installed NLTK version.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)
class OptimizedTextProcessor:
    """Turn raw text into cleaned, lower-cased, boundary-marked sentences.

    Each sentence produced by ``preprocess_text`` has the form
    ``"<s> word word ... </s>"``.
    """

    # NLTK data packages that may back sent_tokenize, depending on the
    # installed NLTK version ("punkt" for older releases, "punkt_tab" for
    # newer ones).
    _PUNKT_RESOURCES = ('punkt', 'punkt_tab')

    def __init__(self):
        # True once this instance has successfully run the sentence tokenizer
        # (i.e. the tokenizer data is known to be available).
        self.nltk_downloaded = False

    def _split_sentences(self, text: str) -> list:
        """Split *text* into sentences, fetching tokenizer data on demand.

        The download is attempted only when NLTK reports the data as missing,
        and at most once per instance; a second failure is re-raised.
        """
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            if self.nltk_downloaded:
                raise
            for resource in self._PUNKT_RESOURCES:
                nltk.download(resource)
            sentences = sent_tokenize(text)
        self.nltk_downloaded = True
        return sentences

    def preprocess_text(self, text: str) -> list:
        """Return the sentences of *text*, cleaned and wrapped in <s> ... </s>.

        Args:
            text: Arbitrary user-supplied text.

        Returns:
            A list of strings, one per non-empty sentence. Tokens that contain
            no alphanumeric character (stand-alone punctuation) are dropped,
            and the remaining tokens are lower-cased and joined with single
            spaces. Empty or whitespace-only input yields an empty list.

        Raises:
            LookupError: If the tokenizer data is missing and could not be
                downloaded.
        """
        # Nothing to analyse: answer without touching the tokenizer at all.
        if not text or not text.strip():
            return []

        processed_sentences = []
        # Split on the original casing: the Punkt model relies on
        # capitalisation to decide whether a period ends a sentence, so
        # lower-casing happens only after the split.
        for sentence in self._split_sentences(text):
            cleaned = ' '.join(
                word
                for word in sentence.lower().split()
                if any(c.isalnum() for c in word)
            )
            if cleaned:
                processed_sentences.append(f"<s> {cleaned} </s>")
        return processed_sentences
class EnhancedKenLMModel:
    """Score text sentence by sentence with a KenLM language model."""

    # Boundary markers added by the text processor. KenLM's ``score`` inserts
    # the sentence boundaries itself (bos/eos), so the literal markers must not
    # be part of the string that is scored.
    _BOUNDARY_MARKERS = frozenset(('<s>', '</s>'))

    def __init__(self, model_path: str):
        """Load the KenLM model stored at *model_path*."""
        self.model = kenlm.Model(model_path)
        self.processor = OptimizedTextProcessor()

    def evaluate_text(self, text: str) -> dict:
        """Compute per-sentence and overall log10 probability and perplexity.

        Args:
            text: Raw text to analyse.

        Returns:
            A dict with three keys:

            * ``'sentence_scores'`` - list of dicts, one per sentence, holding
              ``'sentence'`` (text without boundary markers), ``'score'``
              (log10 probability returned by the model), ``'perplexity'`` and
              ``'words'`` (number of real words in the sentence).
            * ``'average_log_prob'`` - total log10 probability divided by the
              number of scored tokens, or 0 when nothing was scored.
            * ``'overall_perplexity'`` - ``10 ** -average_log_prob``, or 0 when
              nothing was scored.
        """
        results = []
        total_log_prob = 0.0
        total_tokens = 0

        for marked_sentence in self.processor.preprocess_text(text):
            tokens = [
                token
                for token in marked_sentence.split()
                if token not in self._BOUNDARY_MARKERS
            ]
            if not tokens:
                continue

            plain_sentence = ' '.join(tokens)
            # The model adds <s> and </s> itself; passing them in the string
            # as well would score the boundaries twice.
            score = self.model.score(plain_sentence, bos=True, eos=True)

            words = len(tokens)
            # The score covers every word plus the end-of-sentence token, so
            # that is the count the log probability is normalised by.
            scored_tokens = words + 1
            perplexity = 10.0 ** (-score / scored_tokens)

            results.append({
                'sentence': plain_sentence,
                'score': score,
                'perplexity': perplexity,
                'words': words,
            })
            total_log_prob += score
            total_tokens += scored_tokens

        if total_tokens > 0:
            average_log_prob = total_log_prob / total_tokens
            overall_perplexity = 10.0 ** (-average_log_prob)
        else:
            average_log_prob = 0
            overall_perplexity = 0

        return {
            'sentence_scores': results,
            'average_log_prob': average_log_prob,
            'overall_perplexity': overall_perplexity,
        }
# Page chrome: configuration first, then the title and a short
# description of what the app does.
st.set_page_config(
    layout="wide",
    page_title="KenLM Text Analysis",
)
st.title("Advanced Text Analysis with KenLM")
st.markdown(
    """
### About this app
This application uses an optimized KenLM language model to analyze text quality and fluency.
It provides detailed metrics and visualizations to help understand the text structure.
"""
)
# Initialize model
@st.cache_resource
def load_model(model_path: str = "path_to_your_model.arpa"):
    """Load the KenLM model wrapper, cached by Streamlit.

    Args:
        model_path: Location of the KenLM model file. The default is the
            placeholder path the app shipped with; pass a real path to load a
            different model without editing this function.

    Returns:
        An ``EnhancedKenLMModel`` built from *model_path*. Because of
        ``st.cache_resource`` the model is constructed once per distinct
        *model_path* and reused afterwards.
    """
    return EnhancedKenLMModel(model_path)
# Main page flow: load the model, read the user's text, show the metrics.
# Any failure (missing model file, tokenizer data, ...) is reported on the
# page instead of surfacing as a raw traceback.
try:
    model = load_model()

    # Text input
    text_input = st.text_area(
        "Enter your text for analysis",
        height=150,
        placeholder="Enter the text you want to analyze..."
    )

    # Whitespace-only input contains nothing to score, so it is treated the
    # same as an empty box.
    if text_input and text_input.strip():
        with st.spinner("Analyzing text..."):
            results = model.evaluate_text(text_input)

        # Display overall metrics
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Average Log Probability", f"{results['average_log_prob']:.4f}")
        with col2:
            st.metric("Overall Perplexity", f"{results['overall_perplexity']:.4f}")

        sentence_scores = results['sentence_scores']
        if sentence_scores:
            # Create visualization
            df = pd.DataFrame(sentence_scores)
            # Number the bars from 1 so they line up with the
            # "Sentence N" expanders below.
            df['sentence_number'] = range(1, len(df) + 1)

            # Perplexity plot
            fig_perplexity = px.bar(
                df,
                x='sentence_number',
                y='perplexity',
                title="Sentence Perplexity Scores",
                labels={'sentence_number': 'Sentence'}
            )
            st.plotly_chart(fig_perplexity, use_container_width=True)

            # Detailed sentence analysis
            st.subheader("Detailed Sentence Analysis")
            for number, entry in enumerate(sentence_scores, start=1):
                with st.expander(f"Sentence {number}"):
                    st.write(f"Text: {entry['sentence']}")
                    st.write(f"Score: {entry['score']:.4f}")
                    st.write(f"Perplexity: {entry['perplexity']:.4f}")
                    st.write(f"Word count: {entry['words']}")
except Exception as e:
    st.error(f"An error occurred: {e}")