# app.py
import streamlit as st
import kenlm
import pandas as pd
import plotly.express as px
from pathlib import Path
import torch
from datasets import load_dataset
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

class OptimizedTextProcessor:
    """Turn raw text into lower-cased sentences wrapped in ``<s> ... </s>``."""

    def __init__(self):
        # Set to True once this instance has requested the 'punkt' download.
        self.nltk_downloaded = False

    def preprocess_text(self, text: str) -> list:
        """Split *text* into cleaned, marker-wrapped sentences.

        The text is lower-cased and split into sentences with NLTK's
        ``sent_tokenize``. Within each sentence, whitespace-separated tokens
        that contain no alphanumeric character are dropped. Sentences that
        end up empty are skipped.

        Args:
            text: Raw input text.

        Returns:
            A list of strings of the form ``"<s> token token ... </s>"``,
            one per non-empty sentence, in input order.
        """
        # Request the tokenizer data only the first time this instance is used.
        if not self.nltk_downloaded:
            nltk.download('punkt')
            self.nltk_downloaded = True

        wrapped = []
        for raw_sentence in sent_tokenize(text.lower()):
            # Keep only tokens carrying at least one letter or digit.
            kept_tokens = [
                token.strip()
                for token in raw_sentence.split()
                if any(ch.isalnum() for ch in token)
            ]
            if not kept_tokens:
                continue
            body = ' '.join(kept_tokens)
            wrapped.append(f"<s> {body} </s>")

        return wrapped

class EnhancedKenLMModel:
    """Score text sentence by sentence with a KenLM language model."""

    # Sentence-boundary markers that may be present in preprocessed sentences.
    # They are removed before scoring because the score call below already
    # asks KenLM to handle both boundaries (bos=True, eos=True); leaving them
    # in would score the markers as if they were ordinary words.
    _BOUNDARY_MARKERS = ('<s>', '</s>')

    def __init__(self, model_path: str):
        """Load the KenLM model at *model_path* and create a text processor."""
        self.model = kenlm.Model(model_path)
        self.processor = OptimizedTextProcessor()

    def evaluate_text(self, text: str) -> dict:
        """Compute per-sentence and overall log-probability and perplexity.

        Args:
            text: Raw input text; it is split and cleaned by the processor.

        Returns:
            A dict with:
              * ``'sentence_scores'``: list of dicts with keys ``'sentence'``
                (text without boundary markers), ``'score'`` (value returned
                by the model's ``score``), ``'perplexity'`` and ``'words'``
                (number of real words, markers excluded).
              * ``'average_log_prob'``: total score divided by the number of
                predicted tokens (words plus one end-of-sentence per
                sentence), or 0 when nothing was scored.
              * ``'overall_perplexity'``: ``10 ** -average_log_prob``, or 0
                when nothing was scored.
        """
        processed_sentences = self.processor.preprocess_text(text)
        results = []

        total_log_prob = 0.0
        total_predicted = 0  # words plus one </s> per scored sentence

        for sentence in processed_sentences:
            tokens = [
                token for token in sentence.split()
                if token not in self._BOUNDARY_MARKERS
            ]
            if not tokens:
                # Nothing but markers: there is no content to score.
                continue

            plain_sentence = ' '.join(tokens)
            score = self.model.score(plain_sentence, bos=True, eos=True)

            words = len(tokens)
            # With eos=True the end-of-sentence token is predicted as well,
            # so it counts towards the normalisation.
            predicted = words + 1
            perplexity = pow(10.0, -score / predicted)

            results.append({
                'sentence': plain_sentence,
                'score': score,
                'perplexity': perplexity,
                'words': words
            })

            total_log_prob += score
            total_predicted += predicted

        if total_predicted > 0:
            average_log_prob = total_log_prob / total_predicted
            overall_perplexity = pow(10.0, -average_log_prob)
        else:
            average_log_prob = 0
            overall_perplexity = 0

        return {
            'sentence_scores': results,
            'average_log_prob': average_log_prob,
            'overall_perplexity': overall_perplexity
        }

# Streamlit UI
# Configure the page, then render the heading and the introductory blurb.
_page_options = {"page_title": "KenLM Text Analysis", "layout": "wide"}
st.set_page_config(**_page_options)

_heading = "Advanced Text Analysis with KenLM"
st.title(_heading)

# Markdown shown under the title describing what the app does.
_about_text = """
### About this app
This application uses an optimized KenLM language model to analyze text quality and fluency. 
It provides detailed metrics and visualizations to help understand the text structure.
"""
st.markdown(_about_text)

# Initialize model
@st.cache_resource
def load_model(model_path: str = "path_to_your_model.arpa"):
    """Build the KenLM-backed analysis model.

    Wrapped in ``st.cache_resource`` so the returned object is reused
    instead of being rebuilt on each call with the same argument.

    Args:
        model_path: Path to the KenLM model file. Defaults to the placeholder
            path this app shipped with, so existing ``load_model()`` calls
            keep working; pass a real path to load a different model.

    Returns:
        An ``EnhancedKenLMModel`` wrapping the model at *model_path*.
    """
    return EnhancedKenLMModel(model_path)

# Main UI flow: load the model, read the user's text, score it and render
# the overall metrics, a per-sentence perplexity chart and per-sentence details.
# Any failure (model loading, scoring, rendering) is reported in the page.
try:
    model = load_model()

    # Text input
    text_input = st.text_area(
        "Enter your text for analysis",
        height=150,
        placeholder="Enter the text you want to analyze..."
    )

    # Whitespace-only input has nothing to score, so treat it like empty input.
    if text_input and text_input.strip():
        with st.spinner("Analyzing text..."):
            results = model.evaluate_text(text_input)

        sentence_scores = results['sentence_scores']

        if not sentence_scores:
            # Without any scored sentence the aggregate metrics would be
            # placeholder zeros, so say so instead of showing them.
            st.warning("No scoreable sentences were found in the input.")
        else:
            # Display overall metrics
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Average Log Probability", f"{results['average_log_prob']:.4f}")
            with col2:
                st.metric("Overall Perplexity", f"{results['overall_perplexity']:.4f}")

            # Create visualization
            df = pd.DataFrame(sentence_scores)
            # Number sentences from 1 so the chart's x-axis matches the
            # "Sentence N" labels used in the detail expanders below.
            df['sentence_number'] = range(1, len(df) + 1)

            # Perplexity plot
            fig_perplexity = px.bar(
                df,
                x='sentence_number',
                y='perplexity',
                labels={'sentence_number': 'Sentence', 'perplexity': 'Perplexity'},
                title="Sentence Perplexity Scores"
            )
            st.plotly_chart(fig_perplexity, use_container_width=True)

            # Detailed sentence analysis
            st.subheader("Detailed Sentence Analysis")
            for number, score in enumerate(sentence_scores, start=1):
                with st.expander(f"Sentence {number}"):
                    st.write(f"Text: {score['sentence']}")
                    st.write(f"Score: {score['score']:.4f}")
                    st.write(f"Perplexity: {score['perplexity']:.4f}")
                    st.write(f"Word count: {score['words']}")

except Exception as e:
    # Top-level boundary of the script: surface the problem to the user
    # rather than letting the page fail with a traceback.
    st.error(f"An error occurred: {str(e)}")