Spaces:

V8055
/

kenlmApp

Runtime error

App Files Files Community

kenlmApp / app.py

V8055

Update app.py

4ede509 verified about 1 year ago

raw

history blame contribute delete

4.42 kB

	# app.py
	import streamlit as st
	import kenlm
	import pandas as pd
	import plotly.express as px
	from pathlib import Path
	import torch
	from datasets import load_dataset
	import nltk
	from nltk.tokenize import sent_tokenize
	nltk.download('punkt')

	class OptimizedTextProcessor:
	    """Split raw text into cleaned, boundary-marked sentences.

	    Each sentence found by NLTK's ``sent_tokenize`` is lower-cased, stripped
	    of tokens that contain no alphanumeric character, and wrapped as
	    ``"<s> ... </s>"``.
	    """

	    # Tokenizer data packages to look for. Newer NLTK releases look up
	    # 'punkt_tab' while older ones use 'punkt', so both are checked.
	    _TOKENIZER_RESOURCES = ('punkt', 'punkt_tab')

	    def __init__(self):
	        # Becomes True once this instance has checked the tokenizer data.
	        self.nltk_downloaded = False

	    def _ensure_tokenizer_data(self) -> None:
	        """Download tokenizer data only when it is not already installed."""
	        if self.nltk_downloaded:
	            return
	        for resource in self._TOKENIZER_RESOURCES:
	            try:
	                nltk.data.find(f'tokenizers/{resource}')
	            except LookupError:
	                # Only reach for the network when the data is really missing.
	                nltk.download(resource, quiet=True)
	        self.nltk_downloaded = True

	    def preprocess_text(self, text: str) -> list:
	        """Return the sentences of *text* as ``"<s> words </s>"`` strings.

	        Args:
	            text: Raw input text. ``None``, empty and whitespace-only input
	                yield an empty list without touching the tokenizer.

	        Returns:
	            One string per non-empty sentence, lower-cased, with tokens that
	            contain no alphanumeric character removed and words separated by
	            single spaces.
	        """
	        if not text or not text.strip():
	            return []

	        self._ensure_tokenizer_data()

	        processed_sentences = []
	        # Tokenize the original-case text and lower-case afterwards:
	        # lower-casing first would remove the capitalisation cues the
	        # sentence tokenizer can use to place boundaries.
	        for sentence in sent_tokenize(text):
	            cleaned = ' '.join(
	                word
	                for word in sentence.lower().split()
	                if any(c.isalnum() for c in word)
	            )
	            if cleaned:
	                processed_sentences.append(f"<s> {cleaned} </s>")

	        return processed_sentences

	class EnhancedKenLMModel:
	    """Score text sentence by sentence with a KenLM language model."""

	    def __init__(self, model_path: str):
	        """Load the KenLM model at *model_path* and create a text processor."""
	        self.model = kenlm.Model(model_path)
	        self.processor = OptimizedTextProcessor()

	    def evaluate_text(self, text: str) -> dict:
	        """Compute per-sentence and overall log10 scores and perplexities.

	        Args:
	            text: Raw text; it is split and cleaned by ``self.processor``.

	        Returns:
	            A dict with:
	              * ``'sentence_scores'``: list of dicts with keys ``'sentence'``
	                (cleaned text), ``'score'`` (log10 probability),
	                ``'perplexity'`` and ``'words'`` (number of real words).
	              * ``'average_log_prob'``: total log10 probability divided by
	                the number of scored tokens, or 0 when nothing was scored.
	              * ``'overall_perplexity'``: perplexity over all scored tokens,
	                or 0 when nothing was scored.
	        """
	        results = []
	        total_log_prob = 0.0
	        total_tokens = 0

	        for marked_sentence in self.processor.preprocess_text(text):
	            # The processor wraps sentences in literal "<s>"/"</s>" markers.
	            # kenlm's score() already adds the begin/end-of-sentence context
	            # itself (bos/eos), so the markers are removed before scoring;
	            # otherwise they would be scored and counted as extra words.
	            sentence = (
	                marked_sentence.replace('<s>', '').replace('</s>', '').strip()
	            )
	            words = len(sentence.split())
	            if words == 0:
	                continue

	            score = self.model.score(sentence, bos=True, eos=True)
	            # The score covers every word plus the end-of-sentence token,
	            # so that is the count used to normalise the perplexity.
	            tokens = words + 1
	            perplexity = pow(10.0, -score / tokens)

	            results.append({
	                'sentence': sentence,
	                'score': score,
	                'perplexity': perplexity,
	                'words': words,
	            })

	            total_log_prob += score
	            total_tokens += tokens

	        if total_tokens > 0:
	            average_log_prob = total_log_prob / total_tokens
	            overall_perplexity = pow(10.0, -average_log_prob)
	        else:
	            average_log_prob = 0
	            overall_perplexity = 0

	        return {
	            'sentence_scores': results,
	            'average_log_prob': average_log_prob,
	            'overall_perplexity': overall_perplexity,
	        }

	# Streamlit page configuration and static header content.
	st.set_page_config(layout="wide", page_title="KenLM Text Analysis")

	st.title("Advanced Text Analysis with KenLM")

	# Short introduction rendered as Markdown below the title.
	st.markdown(
	    "\n"
	    "### About this app\n"
	    "This application uses an optimized KenLM language model to analyze text quality and fluency.\n"
	    "It provides detailed metrics and visualizations to help understand the text structure.\n"
	)

	# Initialize model
	@st.cache_resource
	def load_model(model_path: str = "path_to_your_model.arpa"):
	    """Build the KenLM-backed model, cached by Streamlit across reruns.

	    Args:
	        model_path: Path to the KenLM model file. Defaults to the
	            placeholder path this app was written with.

	    Returns:
	        An ``EnhancedKenLMModel`` wrapping the model at *model_path*.

	    Raises:
	        FileNotFoundError: If *model_path* does not point to a file.
	    """
	    # Fail early with a message that names the path, rather than relying on
	    # the error raised from inside the model loader.
	    if not Path(model_path).is_file():
	        raise FileNotFoundError(f"KenLM model file not found: {model_path}")
	    return EnhancedKenLMModel(model_path)

	# Main interaction: load the model, read the text, show scores.
	# Any failure is reported in the page instead of crashing the app.
	try:
	    model = load_model()

	    # Text input
	    text_input = st.text_area(
	        "Enter your text for analysis",
	        height=150,
	        placeholder="Enter the text you want to analyze...",
	    )

	    # Whitespace-only input has nothing to score, so treat it like empty.
	    if text_input and text_input.strip():
	        with st.spinner("Analyzing text..."):
	            results = model.evaluate_text(text_input)

	        sentence_scores = results['sentence_scores']

	        if not sentence_scores:
	            # Without any scored sentence the metrics would only show
	            # placeholder zeros, so say so explicitly instead.
	            st.warning("No scorable sentences were found in the input.")
	        else:
	            # Display overall metrics
	            col1, col2 = st.columns(2)
	            with col1:
	                st.metric("Average Log Probability", f"{results['average_log_prob']:.4f}")
	            with col2:
	                st.metric("Overall Perplexity", f"{results['overall_perplexity']:.4f}")

	            # Create visualization
	            df = pd.DataFrame(sentence_scores)
	            # Number sentences from 1 so the chart matches the
	            # "Sentence N" expanders shown below it.
	            df['sentence_number'] = df.index + 1

	            # Perplexity plot
	            fig_perplexity = px.bar(
	                df,
	                x='sentence_number',
	                y='perplexity',
	                title="Sentence Perplexity Scores",
	                labels={'sentence_number': 'Sentence', 'perplexity': 'Perplexity'},
	            )
	            st.plotly_chart(fig_perplexity, use_container_width=True)

	            # Detailed sentence analysis
	            st.subheader("Detailed Sentence Analysis")
	            for idx, score in enumerate(sentence_scores):
	                with st.expander(f"Sentence {idx + 1}"):
	                    st.write(f"Text: {score['sentence']}")
	                    st.write(f"Score: {score['score']:.4f}")
	                    st.write(f"Perplexity: {score['perplexity']:.4f}")
	                    st.write(f"Word count: {score['words']}")

	except Exception as e:
	    # Top-level boundary of the script: surface the error to the user.
	    st.error(f"An error occurred: {e}")