# src/main.py
import streamlit as st
from utils.config import load_config
from core.text_processing import TextProcessor
from core.analysis import AdvancedAnalyzer
from core.ui import UI

def main():
    """Render the Streamlit page and run an analysis when requested.

    Loads the configuration, builds the text processor and analyzer, lets
    the user adjust the analysis settings from the sidebar, collects the
    text (typed or uploaded) and, once the "Analyze" button is pressed and
    ``validate_text`` accepts the input, displays the analysis results.
    Any exception raised during analysis is shown with ``st.error``.
    """
    config = load_config()
    UI.setup_page()

    text_processor = TextProcessor(config)
    analyzer = AdvancedAnalyzer(config)

    with st.sidebar:
        st.title("Analysis Settings")
        # Same dict object as config['analysis'], so the slider values
        # written below are visible to everything holding `config`.
        settings = config['analysis']
        settings['num_topics'] = st.slider(
            "Number of Topics", 2, 10, settings['num_topics']
        )
        settings['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            settings['min_entity_confidence']
        )

    st.title("Enhanced AI Output Analyzer")
    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])

    if input_method != "File Upload":
        text = st.text_area("Enter text to analyze:", height=200)
    else:
        uploaded_file = st.file_uploader("Upload a text file", type=['txt'])
        text = text_processor.process_file_upload(uploaded_file)

    # Nothing to do until the button is pressed; validation only runs after
    # the press, mirroring the short-circuit of a combined condition.
    if not st.button("Analyze", type="primary"):
        return
    if not text_processor.validate_text(text, settings['max_text_length']):
        return

    try:
        with st.spinner("Analyzing text..."):
            analysis = analyzer.analyze_text(text)
            UI.display_results(analysis)
    except Exception as exc:
        st.error(f"An error occurred during analysis: {exc!s}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

# src/core/analysis.py
import streamlit as st
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import spacy
from typing import Dict
import logging

# Module-level logger named after this module; used by the analysis classes
# below to report errors and model initialization.
logger = logging.getLogger(__name__)

class TopicModeler:
    """Fit an LDA topic model on a single piece of text.

    The text is split into sentences and every sentence is treated as one
    document. A corpus is required because the vectorizer's document
    frequency limits (``min_df=2``, ``max_df=0.95``) cannot be satisfied by
    a single document: scikit-learn rejects that combination with a
    ``ValueError``.
    """

    def __init__(self, num_topics=3):
        """Create the lemmatizer, vectorizer and LDA estimator.

        Args:
            num_topics: Number of LDA components (topics) to fit.
        """
        self.num_topics = num_topics
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = CountVectorizer(
            max_df=0.95, min_df=2,
            stop_words='english', max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42, max_iter=10
        )
        # Loaded on first use and then reused, so the stop-word list is not
        # rebuilt for every sentence.
        self._stop_words = None

    def preprocess_text(self, text):
        """Lower-case, tokenize, drop stop words and lemmatize ``text``.

        Only alphanumeric tokens that are not English stop words are kept.

        Args:
            text: Raw text to normalize.

        Returns:
            The remaining lemmatized tokens joined by single spaces (an
            empty string when nothing remains).

        Raises:
            Exception: Any tokenizer/corpus error is logged and re-raised.
        """
        try:
            if self._stop_words is None:
                self._stop_words = set(stopwords.words('english'))
            stop_words = self._stop_words
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in word_tokenize(text.lower())
                if token.isalnum() and token not in stop_words
            ]
            return ' '.join(tokens)
        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            raise

    def _split_documents(self, text):
        """Return one preprocessed document per sentence, skipping empty ones."""
        # Sentence splitting runs on the raw text; lower-casing happens
        # afterwards inside preprocess_text.
        processed = (
            self.preprocess_text(sentence) for sentence in sent_tokenize(text)
        )
        documents = [document for document in processed if document]
        if not documents:
            raise ValueError("Text contains no usable words for topic modeling")
        return documents

    def _vectorize(self, documents):
        """Build the document-term matrix, relaxing frequency limits if needed.

        Returns:
            A ``(vectorizer, matrix)`` pair; ``vectorizer`` is the instance
            that actually produced ``matrix`` and owns its feature names.
        """
        try:
            return self.vectorizer, self.vectorizer.fit_transform(documents)
        except ValueError:
            # Short texts: min_df/max_df either conflict with each other or
            # prune every term. Retry with the same settings but without
            # document-frequency pruning instead of failing outright.
            relaxed_params = {
                **self.vectorizer.get_params(), 'min_df': 1, 'max_df': 1.0
            }
            relaxed = CountVectorizer(**relaxed_params)
            return relaxed, relaxed.fit_transform(documents)

    def extract_topics(self, text):
        """Fit LDA on the sentences of ``text`` and describe each topic.

        Args:
            text: Raw text to model.

        Returns:
            A list with one dict per topic:
            ``id`` (topic index), ``words`` (up to nine highest-weighted
            terms, strongest first) and ``coherence`` (the mean of the
            topic's term weights in ``lda.components_`` -- a simple summary
            value, not a standard topic-coherence metric).

        Raises:
            ValueError: If the text yields no usable words or vocabulary.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            documents = self._split_documents(text)
            vectorizer, dtm = self._vectorize(documents)
            self.lda.fit(dtm)
            feature_names = vectorizer.get_feature_names_out()

            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                # [:-10:-1] walks the ascending argsort backwards, i.e. the
                # nine largest weights in descending order.
                top_words = [
                    feature_names[i]
                    for i in topic.argsort()[:-10:-1]
                ]
                topics.append({
                    'id': topic_idx,
                    'words': top_words,
                    'coherence': float(np.mean(topic))
                })
            return topics
        except Exception as e:
            logger.error(f"Error in topic modeling: {str(e)}")
            raise

class AdvancedAnalyzer:
    """Run sentiment, topic and named-entity analysis on a text.

    Reads ``config['models']`` for the spaCy and sentiment model names and
    ``config['analysis']`` for ``num_topics`` and ``min_entity_confidence``.
    """

    def __init__(self, config):
        """Store the config, create a topic modeler and load the models.

        Args:
            config: Configuration mapping; the same object is consulted on
                every analysis call, so later changes to it take effect.
        """
        self.config = config
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    @staticmethod
    @st.cache_resource
    def _load_models(spacy_name, sentiment_name):
        """Load the spaCy pipeline and the sentiment pipeline.

        Cached with ``st.cache_resource`` and keyed on the two model names.
        The loaded objects are *returned* rather than stored on an
        instance, so a cache hit still hands every caller its models.

        Returns:
            A ``(nlp, sentiment_model)`` tuple.
        """
        nlp = spacy.load(spacy_name)
        sentiment_model = pipeline(
            "sentiment-analysis",
            model=sentiment_name,
            return_all_scores=True
        )
        return nlp, sentiment_model

    def _initialize_models(self):
        """Attach the (cached) models to this instance.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            model_names = self.config['models']
            self.nlp, self.sentiment_model = self._load_models(
                model_names['spacy'], model_names['sentiment']
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def analyze_text(self, text: str) -> Dict:
        """Run the full analysis pipeline on ``text``.

        The topic modeler is rebuilt when the configured ``num_topics``
        differs from the one it was created with.

        Returns:
            A dict with the keys ``sentiment``, ``topics`` and ``entities``.

        Raises:
            Exception: Any failure in a sub-analysis is logged and re-raised.
        """
        try:
            num_topics = self.config['analysis']['num_topics']
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)

            results = {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }
            return results
        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise

    @staticmethod
    def _positivity(prediction) -> float:
        """Collapse one sentence's prediction into a 0..1 positivity value.

        Accepts the label/score dicts of one input in any of the shapes the
        pipeline may hand back (nested list, flat list or a single dict).

        * Labels that all carry a number (e.g. ``"1 star"`` .. ``"5 stars"``)
          are treated as an ordinal scale and the probability-weighted
          rating is rescaled to 0..1.
        * Otherwise labels starting with ``pos`` count fully, ``neu`` count
          half and ``neg`` count as zero.

        Raises:
            ValueError: If the prediction is empty or its labels cannot be
                interpreted.
        """
        # Unwrap the extra list level(s) returned for a single input.
        while (isinstance(prediction, (list, tuple)) and prediction
               and isinstance(prediction[0], (list, tuple))):
            prediction = prediction[0]
        if isinstance(prediction, dict):
            prediction = [prediction]
        if not prediction:
            raise ValueError("Empty sentiment prediction")

        total = sum(item['score'] for item in prediction)
        if total <= 0:
            raise ValueError("Sentiment prediction has no probability mass")

        ranks = []
        for item in prediction:
            digits = ''.join(ch for ch in str(item['label']) if ch.isdigit())
            ranks.append(int(digits) if digits else None)

        if len(prediction) > 1 and all(rank is not None for rank in ranks):
            lowest, highest = min(ranks), max(ranks)
            if highest > lowest:
                # Assumes a larger number means a more positive class, as
                # with star ratings -- confirm for other numbered labels.
                expected = sum(
                    rank * item['score']
                    for rank, item in zip(ranks, prediction)
                ) / total
                return (expected - lowest) / (highest - lowest)

        positive_mass = 0.0
        recognised = False
        for item in prediction:
            name = str(item['label']).lower()
            if name.startswith('pos'):
                positive_mass += item['score']
                recognised = True
            elif name.startswith('neu'):
                positive_mass += 0.5 * item['score']
                recognised = True
            elif name.startswith('neg'):
                recognised = True
        if not recognised:
            labels = [item['label'] for item in prediction]
            raise ValueError(f"Unrecognised sentiment labels: {labels}")
        return positive_mass / total

    def analyze_sentiment_batch(self, text: str) -> Dict:
        """Score each sentence of ``text`` and average the results.

        Sentences are submitted to the sentiment model through a thread
        pool. A sentence whose prediction fails or cannot be interpreted is
        logged and skipped.

        Returns:
            ``{'score': mean positivity in 0..1,
            'label': 'positive' if score > 0.5 else 'negative'}``.

        Raises:
            ValueError: If no sentence produced a usable prediction
                (including empty text).
        """
        sentences = sent_tokenize(text)
        positivity_scores = []

        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.sentiment_model, sentence)
                for sentence in sentences
            ]
            for future in futures:
                try:
                    positivity_scores.append(self._positivity(future.result()))
                except Exception as e:
                    logger.error(f"Error in sentiment analysis: {str(e)}")
                    continue

        if not positivity_scores:
            raise ValueError("No successful sentiment analysis results")

        score = float(np.mean(positivity_scores))
        return {
            'score': score,
            'label': 'positive' if score > 0.5 else 'negative'
        }

    def extract_entities(self, text: str) -> list:
        """Return the named entities of ``text`` that meet the threshold.

        An entity without a ``confidence`` custom attribute (or with it set
        to ``None``) is given confidence ``1.0``, so pipelines that do not
        report confidences still yield their entities.

        Returns:
            A list of dicts with the keys ``text``, ``label`` and
            ``confidence``, restricted to entities whose confidence is at
            least ``config['analysis']['min_entity_confidence']``.
        """
        doc = self.nlp(text)
        entities = []
        for ent in doc.ents:
            reported = getattr(ent._, 'confidence', None)
            confidence = 1.0 if reported is None else float(reported)
            if confidence >= self.config['analysis']['min_entity_confidence']:
                entities.append({
                    'text': ent.text,
                    'label': ent.label_,
                    'confidence': confidence
                })
        return entities

# src/utils/config.py
import yaml
from pathlib import Path

def load_config():
    """Load ``config.yaml``, creating it with defaults when it is missing.

    The file is looked up three directory levels above this module. Values
    found in the file take precedence; any section or key the file does not
    define is filled in from the built-in defaults, so an empty or partial
    file still yields a complete configuration.

    Returns:
        A dict with the sections ``models``, ``analysis``, ``security`` and
        ``logging`` (plus anything else present in the file).

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    defaults = {
        'models': {
            'spacy': 'en_core_web_sm',
            'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
        },
        'analysis': {
            'batch_size': 1000,
            'min_entity_confidence': 0.8,
            'num_topics': 3,
            'max_text_length': 50000
        },
        'security': {
            'max_file_size': 5242880,
            'allowed_extensions': ['txt']
        },
        'logging': {
            'level': 'INFO',
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        }
    }

    config_path = Path(__file__).parent.parent.parent / "config.yaml"

    if not config_path.exists():
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(defaults, f, default_flow_style=False)

    with open(config_path, encoding='utf-8') as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if loaded is None:
        return defaults
    if not isinstance(loaded, dict):
        raise ValueError(
            f"{config_path} must contain a mapping at the top level, "
            f"got {type(loaded).__name__}"
        )

    for section, section_defaults in defaults.items():
        current = loaded.get(section)
        if isinstance(current, dict):
            # Keep the user's values; only add keys they left out.
            for key, value in section_defaults.items():
                current.setdefault(key, value)
        else:
            # Section missing (or not a mapping): callers index into it, so
            # fall back to the default section.
            loaded[section] = section_defaults
    return loaded