Spaces:
Sleeping
Sleeping
| # src/main.py | |
| import streamlit as st | |
| from utils.config import load_config | |
| from core.text_processing import TextProcessor | |
| from core.analysis import AdvancedAnalyzer | |
| from core.ui import UI | |
def main():
    """Streamlit entry point: render settings, collect text, run the analysis."""
    config = load_config()
    UI.setup_page()

    processor = TextProcessor(config)
    engine = AdvancedAnalyzer(config)

    # Sidebar controls write the chosen values straight back into the config
    # dict so downstream components pick them up.
    with st.sidebar:
        st.title("Analysis Settings")
        config['analysis']['num_topics'] = st.slider(
            "Number of Topics", 2, 10,
            config['analysis']['num_topics']
        )
        config['analysis']['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            config['analysis']['min_entity_confidence']
        )

    st.title("Enhanced AI Output Analyzer")

    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
    if input_method == "Text Input":
        text = st.text_area("Enter text to analyze:", height=200)
    else:
        uploaded = st.file_uploader("Upload a text file", type=['txt'])
        text = processor.process_file_upload(uploaded)

    run_requested = st.button("Analyze", type="primary")
    if run_requested and processor.validate_text(
        text, config['analysis']['max_text_length']
    ):
        try:
            with st.spinner("Analyzing text..."):
                results = engine.analyze_text(text)
            UI.display_results(results)
        except Exception as e:
            st.error(f"An error occurred during analysis: {str(e)}")


if __name__ == "__main__":
    main()
| # src/core/analysis.py | |
| import streamlit as st | |
| import numpy as np | |
| from nltk.tokenize import word_tokenize, sent_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.decomposition import LatentDirichletAllocation | |
| from concurrent.futures import ThreadPoolExecutor | |
| from transformers import pipeline | |
| import spacy | |
| from typing import Dict | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class TopicModeler:
    """Extract topics from a single document with LDA.

    The whole input text is treated as a one-document corpus, so the
    vectorizer must not use document-frequency pruning: with only one
    document, ``min_df=2`` removes every term and ``max_df < 1.0``
    removes the rest, making ``fit_transform`` raise an "empty
    vocabulary" error.  ``max_df=1.0`` / ``min_df=1`` keep all terms.
    """

    def __init__(self, num_topics=3):
        """
        Args:
            num_topics: number of LDA components to fit.
        """
        self.num_topics = num_topics
        self.lemmatizer = WordNetLemmatizer()
        # Document-frequency cutoffs disabled: the corpus is always a
        # single document (see extract_topics).
        self.vectorizer = CountVectorizer(
            max_df=1.0, min_df=1,
            stop_words='english', max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42, max_iter=10
        )

    def preprocess_text(self, text):
        """Lowercase, tokenize, drop stop words/punctuation, and lemmatize.

        Returns:
            A single space-joined string of cleaned tokens.

        Raises:
            Re-raises any tokenization/lemmatization error after logging.
        """
        try:
            tokens = word_tokenize(text.lower())
            stop_words = set(stopwords.words('english'))
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in stop_words
            ]
            return ' '.join(tokens)
        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            raise

    def extract_topics(self, text):
        """Fit LDA on the (single-document) text and return its topics.

        Returns:
            list[dict]: one dict per topic with keys ``id`` (int),
            ``words`` (top-10 terms) and ``coherence`` (mean topic
            weight — a rough proxy, not a true coherence metric).

        Raises:
            Re-raises any vectorization/LDA error after logging.
        """
        try:
            processed_text = self.preprocess_text(text)
            dtm = self.vectorizer.fit_transform([processed_text])
            self.lda.fit(dtm)
            feature_names = self.vectorizer.get_feature_names_out()
            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                # [:-11:-1] yields the 10 highest-weight terms;
                # the previous [:-10:-1] returned only 9.
                top_words = [
                    feature_names[i]
                    for i in topic.argsort()[:-11:-1]
                ]
                topics.append({
                    'id': topic_idx,
                    'words': top_words,
                    'coherence': float(np.mean(topic))
                })
            return topics
        except Exception as e:
            logger.error(f"Error in topic modeling: {str(e)}")
            raise
class AdvancedAnalyzer:
    """Run sentiment, topic, and entity analysis over a text.

    Heavy models (spaCy pipeline, transformers sentiment pipeline) are
    loaded once in the constructor from names given in ``config``.
    """

    def __init__(self, config):
        """
        Args:
            config: dict with at least ``models.spacy``,
                ``models.sentiment`` and the ``analysis`` section.
        """
        self.config = config
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    def _initialize_models(self):
        """Load the spaCy and transformers models named in the config.

        Raises:
            Re-raises any model-loading error after logging.
        """
        try:
            self.nlp = spacy.load(self.config['models']['spacy'])
            self.sentiment_model = pipeline(
                "sentiment-analysis",
                model=self.config['models']['sentiment'],
                return_all_scores=True
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def analyze_text(self, text: str) -> Dict:
        """Run all analyses and return their results in one dict.

        Rebuilds the topic modeler if the configured topic count changed
        since the last run (the sidebar slider mutates the config).

        Returns:
            dict with keys ``sentiment``, ``topics``, ``entities``.
        """
        try:
            num_topics = self.config['analysis']['num_topics']
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)
            results = {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }
            return results
        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise

    def analyze_sentiment_batch(self, text: str) -> Dict:
        """Score each sentence concurrently and aggregate to one result.

        With ``return_all_scores=True`` the pipeline yields, per input,
        a list of ``{'label', 'score'}`` dicts (one per label) — wrapped
        in an extra list on some transformers versions.  The previous
        ``r[0]['score']`` subscripted that inner list with a string and
        raised ``TypeError``; here we unwrap defensively and use the
        highest-scoring label per sentence.

        Returns:
            dict: ``score`` (mean top-label score over sentences) and
            ``label`` ('positive' if the mean exceeds 0.5).
            NOTE(review): the 0.5 threshold assumes a roughly binary
            model output — confirm against the configured model.

        Raises:
            ValueError: if no sentence was scored successfully.
        """
        sentences = sent_tokenize(text)
        results = []
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.sentiment_model, sentence)
                for sentence in sentences
            ]
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Error in sentiment analysis: {str(e)}")
                    continue
        if not results:
            raise ValueError("No successful sentiment analysis results")
        # Aggregate: take the best-scoring label for each sentence.
        sentence_scores = []
        for r in results:
            # r is either [[{label, score}, ...]] or [{label, score}, ...]
            label_scores = r[0] if isinstance(r[0], list) else r
            best = max(label_scores, key=lambda d: d['score'])
            sentence_scores.append(best['score'])
        mean_score = float(np.mean(sentence_scores))
        return {
            'score': mean_score,
            'label': 'positive' if mean_score > 0.5 else 'negative'
        }

    def extract_entities(self, text: str) -> list:
        """Return named entities at or above the configured confidence.

        Stock spaCy models define no ``ent._.confidence`` extension; the
        previous filter required it, which made the ``else 1.0`` fallback
        unreachable and silently returned no entities at all.  Entities
        without a confidence attribute now default to 1.0 and pass the
        threshold.
        """
        threshold = self.config['analysis']['min_entity_confidence']
        entities = []
        doc = self.nlp(text)
        for ent in doc.ents:
            confidence = (
                float(ent._.confidence)
                if hasattr(ent._, 'confidence') else 1.0
            )
            if confidence >= threshold:
                entities.append({
                    'text': ent.text,
                    'label': ent.label_,
                    'confidence': confidence
                })
        return entities
| # src/utils/config.py | |
| import yaml | |
| from pathlib import Path | |
def load_config():
    """Load ``config.yaml`` from the project root, creating defaults if absent.

    The file is expected three directories above this module
    (src/utils/ -> project root).  When missing, a default config is
    written there and returned directly — re-reading the file we just
    wrote (as the previous version did) is redundant, since these plain
    dict/list/scalar values round-trip identically through YAML.

    Returns:
        dict: the parsed (or freshly created) configuration.
    """
    project_root = Path(__file__).parent.parent.parent
    config_path = project_root / "config.yaml"
    if not config_path.exists():
        default_config = {
            'models': {
                'spacy': 'en_core_web_sm',
                'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
            },
            'analysis': {
                'batch_size': 1000,
                'min_entity_confidence': 0.8,
                'num_topics': 3,
                'max_text_length': 50000
            },
            'security': {
                'max_file_size': 5242880,  # 5 MiB
                'allowed_extensions': ['txt']
            },
            'logging': {
                'level': 'INFO',
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            }
        }
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(default_config, f, default_flow_style=False)
        return default_config
    with open(config_path, encoding='utf-8') as f:
        return yaml.safe_load(f)