# src/main.py
import streamlit as st

from utils.config import load_config
from core.text_processing import TextProcessor
from core.analysis import AdvancedAnalyzer
from core.ui import UI


def _clamp(value, low, high):
    """Return *value* limited to the closed interval [low, high]."""
    return max(low, min(high, value))


def main():
    """Run the Streamlit page: collect settings and text, then analyze it.

    The sidebar sliders write their values back into the shared ``config``
    dict, which is the same object held by the analyzer, so changes take
    effect on the next analysis.
    """
    config = load_config()
    UI.setup_page()
    text_processor = TextProcessor(config)
    analyzer = AdvancedAnalyzer(config)
    analysis_cfg = config['analysis']

    with st.sidebar:
        st.title("Analysis Settings")
        # Slider defaults come from YAML; cast and clamp them so a value of
        # the wrong numeric type or outside the range cannot crash the widget.
        analysis_cfg['num_topics'] = st.slider(
            "Number of Topics", 2, 10,
            _clamp(int(analysis_cfg['num_topics']), 2, 10)
        )
        analysis_cfg['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            _clamp(float(analysis_cfg['min_entity_confidence']), 0.0, 1.0)
        )

    st.title("Enhanced AI Output Analyzer")
    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])

    if input_method == "File Upload":
        text = text_processor.process_file_upload(
            st.file_uploader("Upload a text file", type=['txt'])
        )
    else:
        text = st.text_area("Enter text to analyze:", height=200)

    if st.button("Analyze", type="primary") and text_processor.validate_text(
        text, analysis_cfg['max_text_length']
    ):
        # Top-level UI boundary: report any failure to the user instead of
        # letting the script crash.
        try:
            with st.spinner("Analyzing text..."):
                results = analyzer.analyze_text(text)
            UI.display_results(results)
        except Exception as e:
            st.error(f"An error occurred during analysis: {e}")


if __name__ == "__main__":
    main()


# src/core/analysis.py
import streamlit as st
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import spacy
from typing import Dict
import logging
import re

logger = logging.getLogger(__name__)

# Matches rating-style labels such as "1 star" or "5 stars".
_STAR_LABEL = re.compile(r"^(\d+)\s*stars?$")


class TopicModeler:
    """Extract LDA topics from a single text, using its sentences as documents."""

    def __init__(self, num_topics=3, top_words=10):
        """Create the vectorizer and LDA model.

        Args:
            num_topics: Number of LDA components to fit.
            top_words: Maximum number of words reported per topic.
        """
        self.num_topics = num_topics
        self.top_words = top_words
        self.lemmatizer = WordNetLemmatizer()
        # Document-frequency pruning is left at the permissive defaults:
        # the corpus is the sentences of one text, and thresholds such as
        # min_df=2 are rejected by scikit-learn on a one-document corpus
        # or can prune every term of a short text.
        self.vectorizer = CountVectorizer(
            stop_words='english',
            max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42,
            max_iter=10
        )
        # Loaded lazily on first use so a missing NLTK corpus surfaces
        # during preprocessing rather than at construction time.
        self._stop_words = None

    def preprocess_text(self, text):
        """Lowercase, tokenize, drop stop words/non-alphanumerics, lemmatize.

        Returns:
            The surviving lemmas joined by single spaces.

        Raises:
            Exception: Any tokenizer/corpus error is logged and re-raised.
        """
        try:
            if self._stop_words is None:
                self._stop_words = set(stopwords.words('english'))
            stop_words = self._stop_words
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in word_tokenize(text.lower())
                if token.isalnum() and token not in stop_words
            ]
            return ' '.join(tokens)
        except Exception as e:
            logger.error("Error in text preprocessing: %s", e)
            raise

    def extract_topics(self, text):
        """Fit LDA on the sentences of *text* and describe each topic.

        Returns:
            A list of dicts with keys ``id``, ``words`` (highest-weighted
            words first, at most ``top_words`` of them) and ``coherence``
            (the mean of the topic's word weights). The list is empty when
            the text yields no usable vocabulary.

        Raises:
            Exception: Any modelling error is logged and re-raised.
        """
        try:
            documents = [
                self.preprocess_text(sentence)
                for sentence in sent_tokenize(text)
            ]
            documents = [doc for doc in documents if doc]
            if not documents:
                logger.warning("No usable tokens for topic modeling")
                return []

            try:
                dtm = self.vectorizer.fit_transform(documents)
            except ValueError as e:
                # Raised by the vectorizer when no terms survive filtering.
                logger.warning("Topic modeling skipped: %s", e)
                return []

            self.lda.fit(dtm)
            feature_names = self.vectorizer.get_feature_names_out()

            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                top_indices = topic.argsort()[::-1][:self.top_words]
                topics.append({
                    'id': topic_idx,
                    'words': [str(feature_names[i]) for i in top_indices],
                    'coherence': float(np.mean(topic))
                })
            return topics
        except Exception as e:
            logger.error("Error in topic modeling: %s", e)
            raise


@st.cache_resource
def _load_models(spacy_model, sentiment_model):
    """Load the spaCy and sentiment pipelines once per pair of model names.

    Caching lives on a module-level function taking only strings so the
    cache key is hashable and the loaded models can be shared by every
    ``AdvancedAnalyzer`` instance.
    """
    nlp = spacy.load(spacy_model)
    sentiment = pipeline(
        "sentiment-analysis",
        model=sentiment_model,
        return_all_scores=True
    )
    return nlp, sentiment


def _flatten_predictions(result):
    """Unwrap possibly nested pipeline output into a flat list of dicts."""
    while (
        isinstance(result, list)
        and len(result) == 1
        and isinstance(result[0], list)
    ):
        result = result[0]
    if isinstance(result, dict):
        return [result]
    return [item for item in result if isinstance(item, dict)]


def _label_polarity(label):
    """Map a classifier label to a polarity in [0, 1], or None if unknown.

    Recognizes "N star(s)" labels (assumed to be on a 1-5 scale) and labels
    starting with "pos", "neg" or "neu", case-insensitively.
    """
    text = str(label).strip().lower()
    match = _STAR_LABEL.match(text)
    if match:
        stars = max(1, min(5, int(match.group(1))))
        return (stars - 1) / 4.0
    if text.startswith("pos"):
        return 1.0
    if text.startswith("neg"):
        return 0.0
    if text.startswith("neu"):
        return 0.5
    return None


def _sentence_polarity(result):
    """Return the probability-weighted polarity of one pipeline result.

    Returns None when none of the predicted labels is recognized.
    """
    weighted = 0.0
    total = 0.0
    for prediction in _flatten_predictions(result):
        polarity = _label_polarity(prediction.get('label', ''))
        if polarity is None:
            continue
        weight = float(prediction.get('score', 0.0))
        weighted += weight * polarity
        total += weight
    if total <= 0.0:
        return None
    return weighted / total


class AdvancedAnalyzer:
    """Run sentiment, topic and entity analysis over a text."""

    def __init__(self, config):
        """Store *config* (kept by reference) and load the models."""
        self.config = config
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    def _initialize_models(self):
        """Attach the spaCy and sentiment models named in the config.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            models = self.config['models']
            self.nlp, self.sentiment_model = _load_models(
                models['spacy'], models['sentiment']
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error("Error initializing models: %s", e)
            raise

    def analyze_text(self, text: str) -> Dict:
        """Return a dict with ``sentiment``, ``topics`` and ``entities``.

        Raises:
            Exception: Any failure in a sub-analysis is logged and re-raised.
        """
        try:
            # The topic count can change between runs via the shared config,
            # so rebuild the modeler when it no longer matches.
            num_topics = self.config['analysis']['num_topics']
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)

            return {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }
        except Exception as e:
            logger.error("Error in analysis pipeline: %s", e)
            raise

    def analyze_sentiment_batch(self, text: str) -> Dict:
        """Score each sentence and average the polarities.

        Returns:
            ``{'score': float in [0, 1], 'label': 'positive' | 'negative'}``
            where the label is 'positive' when the score exceeds 0.5.

        Raises:
            ValueError: If no sentence produced a usable result.
        """
        sentences = sent_tokenize(text)
        polarities = []

        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.sentiment_model, sentence)
                for sentence in sentences
            ]
            for future in futures:
                # Best effort: one failing sentence must not sink the batch.
                try:
                    polarity = _sentence_polarity(future.result())
                except Exception as e:
                    logger.error("Error in sentiment analysis: %s", e)
                    continue
                if polarity is None:
                    logger.warning(
                        "Sentiment labels not recognized; sentence skipped"
                    )
                    continue
                polarities.append(polarity)

        if not polarities:
            raise ValueError("No successful sentiment analysis results")

        score = float(np.mean(polarities))
        return {
            'score': score,
            'label': 'positive' if score > 0.5 else 'negative'
        }

    def extract_entities(self, text: str) -> list:
        """Return entities whose confidence meets the configured threshold.

        Entities that carry no ``confidence`` extension value are treated
        as having confidence 1.0.
        """
        threshold = self.config['analysis']['min_entity_confidence']
        doc = self.nlp(text)

        entities = []
        for ent in doc.ents:
            # Attribute lookup fails when the extension is not registered;
            # getattr's default covers that case.
            raw_confidence = getattr(ent._, 'confidence', None)
            confidence = 1.0 if raw_confidence is None else float(raw_confidence)
            if confidence >= threshold:
                entities.append({
                    'text': ent.text,
                    'label': ent.label_,
                    'confidence': confidence
                })
        return entities


# src/utils/config.py
import copy
import yaml
from pathlib import Path

# Settings written to config.yaml when the file does not exist yet.
_DEFAULT_CONFIG = {
    'models': {
        'spacy': 'en_core_web_sm',
        'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
    },
    'analysis': {
        'batch_size': 1000,
        'min_entity_confidence': 0.8,
        'num_topics': 3,
        'max_text_length': 50000
    },
    'security': {
        'max_file_size': 5242880,
        'allowed_extensions': ['txt']
    },
    'logging': {
        'level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    }
}


def load_config():
    """Load ``config.yaml`` from three directory levels above this file.

    The file is created with the default settings when it is missing. An
    empty file yields a copy of the defaults instead of ``None``.

    Returns:
        The configuration as a dict.
    """
    config_path = Path(__file__).resolve().parents[2] / "config.yaml"

    if not config_path.exists():
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(_DEFAULT_CONFIG, f, default_flow_style=False)

    with open(config_path, encoding='utf-8') as f:
        loaded = yaml.safe_load(f)

    if not loaded:
        return copy.deepcopy(_DEFAULT_CONFIG)
    return loaded