Spaces:
Sleeping
Sleeping
| # src/main.py | |
| import streamlit as st | |
| from utils.config import load_config | |
| from core.text_processing import TextProcessor | |
| from core.analysis import AdvancedAnalyzer | |
| from core.ui import UI | |
def main():
    """Streamlit entry point: render settings, collect text, run the analysis."""
    config = load_config()
    UI.setup_page()

    processor = TextProcessor(config)
    engine = AdvancedAnalyzer(config)

    # Sidebar controls write the chosen values straight back into the config
    # dict so downstream components pick them up.
    with st.sidebar:
        st.title("Analysis Settings")
        config['analysis']['num_topics'] = st.slider(
            "Number of Topics", 2, 10,
            config['analysis']['num_topics']
        )
        config['analysis']['min_entity_confidence'] = st.slider(
            "Entity Confidence Threshold", 0.0, 1.0,
            config['analysis']['min_entity_confidence']
        )

    st.title("Enhanced AI Output Analyzer")

    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])
    if input_method == "Text Input":
        text = st.text_area("Enter text to analyze:", height=200)
    else:
        uploaded = st.file_uploader("Upload a text file", type=['txt'])
        text = processor.process_file_upload(uploaded)

    run_requested = st.button("Analyze", type="primary")
    if run_requested and processor.validate_text(
        text, config['analysis']['max_text_length']
    ):
        try:
            with st.spinner("Analyzing text..."):
                results = engine.analyze_text(text)
            UI.display_results(results)
        except Exception as e:
            st.error(f"An error occurred during analysis: {str(e)}")


if __name__ == "__main__":
    main()
| # src/core/analysis.py | |
| import streamlit as st | |
| import numpy as np | |
| from nltk.tokenize import word_tokenize, sent_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.decomposition import LatentDirichletAllocation | |
| from concurrent.futures import ThreadPoolExecutor | |
| from transformers import pipeline | |
| import spacy | |
| from typing import Dict | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class TopicModeler:
    """Extract topics from a single document with LDA.

    The whole input text is treated as a one-document corpus, so the
    vectorizer must not use document-frequency pruning: with only one
    document, ``min_df=2`` removes every term and ``max_df < 1.0``
    removes the rest, making ``fit_transform`` raise an "empty
    vocabulary" error.  ``max_df=1.0`` / ``min_df=1`` keep all terms.
    """

    def __init__(self, num_topics=3):
        """
        Args:
            num_topics: number of LDA components to fit.
        """
        self.num_topics = num_topics
        self.lemmatizer = WordNetLemmatizer()
        # Document-frequency cutoffs disabled: the corpus is always a
        # single document (see extract_topics).
        self.vectorizer = CountVectorizer(
            max_df=1.0, min_df=1,
            stop_words='english', max_features=1000
        )
        self.lda = LatentDirichletAllocation(
            n_components=num_topics,
            random_state=42, max_iter=10
        )

    def preprocess_text(self, text):
        """Lowercase, tokenize, drop stop words/punctuation, and lemmatize.

        Returns:
            A single space-joined string of cleaned tokens.

        Raises:
            Re-raises any tokenization/lemmatization error after logging.
        """
        try:
            tokens = word_tokenize(text.lower())
            stop_words = set(stopwords.words('english'))
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in stop_words
            ]
            return ' '.join(tokens)
        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            raise

    def extract_topics(self, text):
        """Fit LDA on the (single-document) text and return its topics.

        Returns:
            list[dict]: one dict per topic with keys ``id`` (int),
            ``words`` (top-10 terms) and ``coherence`` (mean topic
            weight — a rough proxy, not a true coherence metric).

        Raises:
            Re-raises any vectorization/LDA error after logging.
        """
        try:
            processed_text = self.preprocess_text(text)
            dtm = self.vectorizer.fit_transform([processed_text])
            self.lda.fit(dtm)
            feature_names = self.vectorizer.get_feature_names_out()
            topics = []
            for topic_idx, topic in enumerate(self.lda.components_):
                # [:-11:-1] yields the 10 highest-weight terms;
                # the previous [:-10:-1] returned only 9.
                top_words = [
                    feature_names[i]
                    for i in topic.argsort()[:-11:-1]
                ]
                topics.append({
                    'id': topic_idx,
                    'words': top_words,
                    'coherence': float(np.mean(topic))
                })
            return topics
        except Exception as e:
            logger.error(f"Error in topic modeling: {str(e)}")
            raise
class AdvancedAnalyzer:
    """Run sentiment, topic, and entity analysis over a text.

    Heavy models (spaCy pipeline, transformers sentiment pipeline) are
    loaded once in the constructor from names given in ``config``.
    """

    def __init__(self, config):
        """
        Args:
            config: dict with at least ``models.spacy``,
                ``models.sentiment`` and the ``analysis`` section.
        """
        self.config = config
        self.topic_modeler = TopicModeler()
        self._initialize_models()

    def _initialize_models(self):
        """Load the spaCy and transformers models named in the config.

        Raises:
            Re-raises any model-loading error after logging.
        """
        try:
            self.nlp = spacy.load(self.config['models']['spacy'])
            self.sentiment_model = pipeline(
                "sentiment-analysis",
                model=self.config['models']['sentiment'],
                return_all_scores=True
            )
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def analyze_text(self, text: str) -> Dict:
        """Run all analyses and return their results in one dict.

        Rebuilds the topic modeler if the configured topic count changed
        since the last run (the sidebar slider mutates the config).

        Returns:
            dict with keys ``sentiment``, ``topics``, ``entities``.
        """
        try:
            num_topics = self.config['analysis']['num_topics']
            if num_topics != self.topic_modeler.num_topics:
                self.topic_modeler = TopicModeler(num_topics)
            results = {
                'sentiment': self.analyze_sentiment_batch(text),
                'topics': self.topic_modeler.extract_topics(text),
                'entities': self.extract_entities(text)
            }
            return results
        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise

    def analyze_sentiment_batch(self, text: str) -> Dict:
        """Score each sentence concurrently and aggregate to one result.

        With ``return_all_scores=True`` the pipeline yields, per input,
        a list of ``{'label', 'score'}`` dicts (one per label) — wrapped
        in an extra list on some transformers versions.  The previous
        ``r[0]['score']`` subscripted that inner list with a string and
        raised ``TypeError``; here we unwrap defensively and use the
        highest-scoring label per sentence.

        Returns:
            dict: ``score`` (mean top-label score over sentences) and
            ``label`` ('positive' if the mean exceeds 0.5).
            NOTE(review): the 0.5 threshold assumes a roughly binary
            model output — confirm against the configured model.

        Raises:
            ValueError: if no sentence was scored successfully.
        """
        sentences = sent_tokenize(text)
        results = []
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.sentiment_model, sentence)
                for sentence in sentences
            ]
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Error in sentiment analysis: {str(e)}")
                    continue
        if not results:
            raise ValueError("No successful sentiment analysis results")
        # Aggregate: take the best-scoring label for each sentence.
        sentence_scores = []
        for r in results:
            # r is either [[{label, score}, ...]] or [{label, score}, ...]
            label_scores = r[0] if isinstance(r[0], list) else r
            best = max(label_scores, key=lambda d: d['score'])
            sentence_scores.append(best['score'])
        mean_score = float(np.mean(sentence_scores))
        return {
            'score': mean_score,
            'label': 'positive' if mean_score > 0.5 else 'negative'
        }

    def extract_entities(self, text: str) -> list:
        """Return named entities at or above the configured confidence.

        Stock spaCy models define no ``ent._.confidence`` extension; the
        previous filter required it, which made the ``else 1.0`` fallback
        unreachable and silently returned no entities at all.  Entities
        without a confidence attribute now default to 1.0 and pass the
        threshold.
        """
        threshold = self.config['analysis']['min_entity_confidence']
        entities = []
        doc = self.nlp(text)
        for ent in doc.ents:
            confidence = (
                float(ent._.confidence)
                if hasattr(ent._, 'confidence') else 1.0
            )
            if confidence >= threshold:
                entities.append({
                    'text': ent.text,
                    'label': ent.label_,
                    'confidence': confidence
                })
        return entities
| # src/utils/config.py | |
| import yaml | |
| from pathlib import Path | |
def load_config():
    """Load ``config.yaml`` from the project root, creating defaults if absent.

    The file is expected three directories above this module
    (src/utils/ -> project root).  When missing, a default config is
    written there and returned directly — re-reading the file we just
    wrote (as the previous version did) is redundant, since these plain
    dict/list/scalar values round-trip identically through YAML.

    Returns:
        dict: the parsed (or freshly created) configuration.
    """
    project_root = Path(__file__).parent.parent.parent
    config_path = project_root / "config.yaml"
    if not config_path.exists():
        default_config = {
            'models': {
                'spacy': 'en_core_web_sm',
                'sentiment': 'nlptown/bert-base-multilingual-uncased-sentiment'
            },
            'analysis': {
                'batch_size': 1000,
                'min_entity_confidence': 0.8,
                'num_topics': 3,
                'max_text_length': 50000
            },
            'security': {
                'max_file_size': 5242880,  # 5 MiB
                'allowed_extensions': ['txt']
            },
            'logging': {
                'level': 'INFO',
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            }
        }
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(default_config, f, default_flow_style=False)
        return default_config
    with open(config_path, encoding='utf-8') as f:
        return yaml.safe_load(f)