|
|
|
|
|
import streamlit as st |
|
|
import kenlm |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
from pathlib import Path |
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
import nltk |
|
|
from nltk.tokenize import sent_tokenize |
|
|
# Fetch the Punkt sentence-tokenizer data that `sent_tokenize` relies on.
# Recent NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models, so both are requested to keep tokenization working across
# NLTK versions. `quiet=True` avoids re-printing the "already up-to-date"
# notice every time Streamlit re-executes this script.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
|
|
|
|
|
class OptimizedTextProcessor:
    """Split raw text into lower-cased sentences wrapped in ``<s>``/``</s>`` markers."""

    # NLTK data packages requested before tokenizing. Recent NLTK releases read
    # "punkt_tab"; older ones read the pickled "punkt" models.
    _PUNKT_RESOURCES = ("punkt", "punkt_tab")

    def __init__(self):
        # Flipped to True once this instance has requested the tokenizer data,
        # so the download check runs at most once per instance.
        self.nltk_downloaded = False

    def preprocess_text(self, text: str) -> list:
        """Return the sentences of *text* as ``"<s> ... </s>"`` strings.

        The text is lower-cased and split into sentences with NLTK's
        ``sent_tokenize``. Within each sentence, whitespace-separated tokens
        that contain no alphanumeric character (stand-alone punctuation) are
        dropped; sentences left empty by that filter are skipped.

        Args:
            text: Raw input text. Empty, whitespace-only or ``None`` input
                yields an empty list without touching NLTK.

        Returns:
            A list of strings, one per retained sentence, each of the form
            ``"<s> token token ... </s>"``.
        """
        # Nothing to tokenize: skip the (potentially network-bound) download.
        if not text or not text.strip():
            return []

        if not self.nltk_downloaded:
            for resource in self._PUNKT_RESOURCES:
                nltk.download(resource, quiet=True)
            self.nltk_downloaded = True

        processed_sentences = []
        for sentence in sent_tokenize(text.lower()):
            # str.split() with no argument already discards surrounding
            # whitespace, so the tokens need no further stripping.
            cleaned = ' '.join(
                word
                for word in sentence.split()
                if any(c.isalnum() for c in word)
            )
            if cleaned:
                processed_sentences.append(f"<s> {cleaned} </s>")

        return processed_sentences
|
|
|
|
|
class EnhancedKenLMModel:
    """Score text sentence by sentence with a KenLM language model."""

    def __init__(self, model_path: str):
        """Load the KenLM model at *model_path* and create the text processor.

        Args:
            model_path: Path handed to ``kenlm.Model``.
        """
        self.model = kenlm.Model(model_path)
        self.processor = OptimizedTextProcessor()

    def evaluate_text(self, text: str) -> dict:
        """Score every sentence of *text* and aggregate the results.

        Sentences come from ``self.processor.preprocess_text`` wrapped in
        literal ``<s>``/``</s>`` markers. The markers are removed before
        scoring because ``kenlm.Model.score`` adds the begin/end-of-sentence
        context itself (``bos=True, eos=True``); scoring the literal markers
        as well would count the sentence boundaries twice.

        Perplexity is normalised by ``words + 1`` because the score includes
        the end-of-sentence token, which is how ``kenlm.Model.perplexity``
        normalises.

        Args:
            text: Raw input text.

        Returns:
            A dict with:
              * ``'sentence_scores'``: list of dicts with keys ``'sentence'``
                (marker-free text), ``'score'`` (log10 probability),
                ``'perplexity'`` and ``'words'`` (number of real words,
                markers excluded).
              * ``'average_log_prob'``: total log10 probability divided by the
                total number of scored tokens (words plus one end-of-sentence
                token per sentence), or ``0`` when nothing was scored.
              * ``'overall_perplexity'``: ``10 ** -average_log_prob``, or ``0``
                when nothing was scored.
        """
        results = []
        total_log_prob = 0
        total_tokens = 0

        for sentence in self.processor.preprocess_text(text):
            bare = sentence.replace('<s>', '').replace('</s>', '').strip()
            if not bare:
                # Only markers were present; there is nothing to score.
                continue

            # KenLM supplies <s> and </s> itself; pass the bare sentence.
            score = self.model.score(bare, bos=True, eos=True)
            words = len(bare.split())
            tokens = words + 1  # the score also covers the implicit </s>
            perplexity = 10.0 ** (-score / tokens)

            results.append({
                'sentence': bare,
                'score': score,
                'perplexity': perplexity,
                'words': words,
            })

            total_log_prob += score
            total_tokens += tokens

        if total_tokens == 0:
            return {
                'sentence_scores': results,
                'average_log_prob': 0,
                'overall_perplexity': 0,
            }

        average_log_prob = total_log_prob / total_tokens
        return {
            'sentence_scores': results,
            'average_log_prob': average_log_prob,
            'overall_perplexity': 10.0 ** -average_log_prob,
        }
|
|
|
|
|
|
|
|
# Page chrome: browser-tab title and wide layout first, then the visible
# heading and a short introduction rendered as Markdown.
st.set_page_config(layout="wide", page_title="KenLM Text Analysis")
st.title("Advanced Text Analysis with KenLM")
st.markdown(
    "\n"
    "### About this app\n"
    "This application uses an optimized KenLM language model to analyze text quality and fluency.\n"
    "It provides detailed metrics and visualizations to help understand the text structure.\n"
)
|
|
|
|
|
|
|
|
@st.cache_resource
def load_model(model_path: str = "path_to_your_model.arpa"):
    """Build the KenLM-backed analyzer, cached by ``st.cache_resource``.

    Args:
        model_path: Path of the KenLM model file to load. Defaults to the
            placeholder path this app was written with, so existing
            ``load_model()`` calls behave as before.

    Returns:
        An ``EnhancedKenLMModel`` constructed from *model_path*.
    """
    return EnhancedKenLMModel(model_path)
|
|
|
|
|
# Main UI flow: load the model, read the user's text, then render aggregate
# metrics, a per-sentence perplexity chart and per-sentence detail panels.
# Any failure along the way is reported in the page instead of crashing it.
try:
    model = load_model()

    text_input = st.text_area(
        "Enter your text for analysis",
        placeholder="Enter the text you want to analyze...",
        height=150,
    )

    if text_input:
        with st.spinner("Analyzing text..."):
            results = model.evaluate_text(text_input)

        # Headline numbers, shown side by side.
        col1, col2 = st.columns(2)
        col1.metric("Average Log Probability", f"{results['average_log_prob']:.4f}")
        col2.metric("Overall Perplexity", f"{results['overall_perplexity']:.4f}")

        sentence_scores = results['sentence_scores']
        if sentence_scores:
            # One bar per sentence, indexed by its position in the input.
            df = pd.DataFrame(sentence_scores)
            fig_perplexity = px.bar(
                df,
                x=df.index,
                y='perplexity',
                title="Sentence Perplexity Scores",
            )
            st.plotly_chart(fig_perplexity, use_container_width=True)

            # Collapsible panel with the raw figures for each sentence.
            st.subheader("Detailed Sentence Analysis")
            for number, entry in enumerate(sentence_scores, start=1):
                panel = st.expander(f"Sentence {number}")
                panel.write(f"Text: {entry['sentence']}")
                panel.write(f"Score: {entry['score']:.4f}")
                panel.write(f"Perplexity: {entry['perplexity']:.4f}")
                panel.write(f"Word count: {entry['words']}")

except Exception as e:
    st.error(f"An error occurred: {e}")