# Provenance: uploaded by shaheerawan3 — "Update app.py" (commit c025acd, verified)
# src/main.py
import streamlit as st
from utils.config import load_config
from core.text_processing import TextProcessor
from core.analysis import AdvancedAnalyzer
from core.ui import UI
def main():
    """Streamlit entry point: collect settings, read input, run the analysis."""
    config = load_config()
    UI.setup_page()

    text_processor = TextProcessor(config)
    analyzer = AdvancedAnalyzer(config)

    # Sidebar sliders write straight back into the shared config dict,
    # so the analyzer picks the values up on the next run.
    with st.sidebar:
        st.title("Analysis Settings")
        analysis_cfg = config['analysis']
        analysis_cfg['num_topics'] = st.slider(
            "Number of Topics", 2, 10, analysis_cfg['num_topics']
        )
        analysis_cfg['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            analysis_cfg['min_entity_confidence']
        )

    st.title("Enhanced AI Output Analyzer")

    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
    if input_method == "File Upload":
        uploaded = st.file_uploader("Upload a text file", type=['txt'])
        text = text_processor.process_file_upload(uploaded)
    else:
        text = st.text_area("Enter text to analyze:", height=200)

    clicked = st.button("Analyze", type="primary")
    # Short-circuit: only validate (and analyze) after an actual click.
    if clicked and text_processor.validate_text(
        text, config['analysis']['max_text_length']
    ):
        try:
            with st.spinner("Analyzing text..."):
                UI.display_results(analyzer.analyze_text(text))
        except Exception as e:
            st.error(f"An error occurred during analysis: {str(e)}")


if __name__ == "__main__":
    main()
# src/core/analysis.py
import streamlit as st
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import spacy
from typing import Dict
import logging
logger = logging.getLogger(__name__)
class TopicModeler:
    """Extract latent topics from free text with LDA.

    The input text is split into sentences that act as pseudo-documents:
    CountVectorizer's document-frequency pruning (min_df/max_df) and LDA
    itself are both meaningless over a single document.
    """

    def __init__(self, num_topics=3):
        # num_topics: number of LDA components to fit.
        self.num_topics = num_topics
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = CountVectorizer(
            max_df=0.95, min_df=2,
            stop_words='english', max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42, max_iter=10
        )

    def preprocess_text(self, text):
        """Lowercase, tokenize, drop stopwords/punctuation, lemmatize.

        Returns a single space-joined string of cleaned tokens.
        Logs and re-raises any NLTK failure (e.g. missing corpora).
        """
        try:
            tokens = word_tokenize(text.lower())
            stop_words = set(stopwords.words('english'))
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in stop_words
            ]
            return ' '.join(tokens)
        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            raise

    def extract_topics(self, text):
        """Return a list of {'id', 'words', 'coherence'} topic dicts.

        Returns [] when the text yields no usable tokens; logs and
        re-raises vectorizer/LDA failures.
        """
        try:
            # BUG FIX: the original fit on a single document, so min_df=2
            # pruned every term and raised "empty vocabulary". Fit on
            # sentence-level pseudo-documents instead.
            docs = [self.preprocess_text(s) for s in sent_tokenize(text)]
            docs = [d for d in docs if d]
            if not docs:
                return []
            try:
                dtm = self.vectorizer.fit_transform(docs)
            except ValueError:
                # Short inputs can still prune everything; retry without
                # document-frequency limits.
                self.vectorizer = CountVectorizer(
                    stop_words='english', max_features=1000
                )
                dtm = self.vectorizer.fit_transform(docs)
            self.lda.fit(dtm)
            feature_names = self.vectorizer.get_feature_names_out()
            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                # Top 10 words by weight (the original slice [:-10:-1]
                # returned only 9).
                top_words = [
                    feature_names[i]
                    for i in topic.argsort()[:-11:-1]
                ]
                topics.append({
                    'id': topic_idx,
                    'words': top_words,
                    # NOTE(review): mean term weight, not a true topic
                    # coherence metric — confirm whether that is intended.
                    'coherence': float(np.mean(topic))
                })
            return topics
        except Exception as e:
            logger.error(f"Error in topic modeling: {str(e)}")
            raise
@st.cache_resource
def _load_models(spacy_model: str, sentiment_model: str):
    """Load and cache the heavy NLP models once per process.

    Module-level so st.cache_resource hashes only the two model-name
    strings. The original decorated an instance method; Streamlit cannot
    hash `self` and raises UnhashableParamError.
    """
    nlp = spacy.load(spacy_model)
    sentiment = pipeline(
        "sentiment-analysis",
        model=sentiment_model,
        return_all_scores=True
    )
    return nlp, sentiment


class AdvancedAnalyzer:
    """Runs sentiment, topic and entity analysis using the app config."""

    def __init__(self, config):
        self.config = config
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    def _initialize_models(self):
        """Attach the cached spaCy and sentiment models to this instance."""
        try:
            self.nlp, self.sentiment_model = _load_models(
                self.config['models']['spacy'],
                self.config['models']['sentiment']
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def analyze_text(self, text: str) -> Dict:
        """Run the full pipeline; returns {'sentiment', 'topics', 'entities'}.

        Rebuilds the topic modeler when the configured topic count has
        changed (e.g. via the sidebar slider). Logs and re-raises failures.
        """
        try:
            num_topics = self.config['analysis']['num_topics']
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)
            results = {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }
            return results
        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise

    def analyze_sentiment_batch(self, text: str) -> Dict:
        """Score sentiment per sentence in parallel and aggregate.

        Returns {'score': mean top-label confidence, 'label': str}.
        Raises ValueError when every sentence fails.
        """
        sentences = sent_tokenize(text)
        results = []
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.sentiment_model, sentence)
                for sentence in sentences
            ]
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Error in sentiment analysis: {str(e)}")
                    continue
        if not results:
            raise ValueError("No successful sentiment analysis results")
        # BUG FIX: with return_all_scores=True each result has the shape
        # [[{label, score}, ...]]; the original indexed r[0]['score'] into
        # that inner *list* and raised TypeError. Take the highest-confidence
        # label per sentence and average those.
        top_scores = [
            max(r[0], key=lambda s: s['score'])['score'] for r in results
        ]
        mean_score = float(np.mean(top_scores))
        # NOTE(review): mean_score is model confidence, not polarity; with
        # a 1-5 star model this positive/negative cut is rough — confirm.
        return {
            'score': mean_score,
            'label': 'positive' if mean_score > 0.5 else 'negative'
        }

    def extract_entities(self, text: str) -> list:
        """Return NER spans at or above the configured confidence threshold.

        BUG FIX: stock spaCy pipelines register no `ent._.confidence`
        extension, so the original `hasattr AND >= threshold` filter
        silently dropped every entity. Entities without a confidence
        attribute now default to 1.0 and pass the threshold.
        """
        doc = self.nlp(text)
        threshold = self.config['analysis']['min_entity_confidence']
        entities = []
        for ent in doc.ents:
            confidence = (
                float(ent._.confidence)
                if hasattr(ent._, 'confidence') else 1.0
            )
            if confidence >= threshold:
                entities.append({
                    'text': ent.text,
                    'label': ent.label_,
                    'confidence': confidence
                })
        return entities
# src/utils/config.py
import yaml
from pathlib import Path
def _default_config() -> dict:
    """Built-in configuration used when no config.yaml exists yet."""
    return {
        'models': {
            'spacy': 'en_core_web_sm',
            'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
        },
        'analysis': {
            'batch_size': 1000,
            'min_entity_confidence': 0.8,
            'num_topics': 3,
            'max_text_length': 50000
        },
        'security': {
            'max_file_size': 5242880,
            'allowed_extensions': ['txt']
        },
        'logging': {
            'level': 'INFO',
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        }
    }


def load_config():
    """Load config.yaml from the project root, creating it if missing.

    Returns the parsed configuration dict. Falls back to the built-in
    defaults when the file exists but is empty (yaml.safe_load -> None),
    so callers never receive None.
    """
    # utils/config.py -> utils -> src -> project root
    current_dir = Path(__file__).parent.parent.parent
    config_path = current_dir / "config.yaml"
    if not config_path.exists():
        # Explicit encoding: the platform default differs on Windows.
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(_default_config(), f, default_flow_style=False)
    with open(config_path, encoding='utf-8') as f:
        return yaml.safe_load(f) or _default_config()