"""Company news digest: scrape recent articles, summarize them, extract
topics, score sentiment, and synthesize a Hindi audio summary."""

from typing import List, Dict
import base64
import tempfile
from io import BytesIO
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from gtts import gTTS
from keybert import KeyBERT
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer

# Heavy model initialization happens once at import time (downloads weights
# on first run).
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)
summary_pipeline = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
)
kw_model = KeyBERT()

# Browser-like User-Agent so the news site does not reject the scrape.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
_TIMEOUT = 15  # seconds; without a timeout requests.get can hang forever


def extract_articles(company: str) -> List[Dict]:
    """Scrape up to five recent articles about *company* from indiatvnews.com.

    Each returned dict has keys: Title, Summary, URL, Topics.
    Raises requests.HTTPError if the topic listing page cannot be fetched;
    failures on individual article pages are skipped (best-effort).
    """
    url = f"https://www.indiatvnews.com/topic/{company}"
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    articles: List[Dict] = []
    for item in soup.select('li.eventTracking')[:5]:
        link = item.find('a', class_='thumb')
        # Skip malformed entries instead of crashing on link['href'].
        if link is None or not link.get('href'):
            continue
        try:
            inner = requests.get(link['href'], headers=_HEADERS, timeout=_TIMEOUT)
            inner.raise_for_status()
        except requests.RequestException:
            # One dead link should not abort the whole scrape.
            continue
        inner_soup = BeautifulSoup(inner.content, "html.parser")
        # Leaf <p> tags only — paragraphs containing child elements are
        # usually navigation/ad chrome, not article text.
        paragraphs = inner_soup.select('p:not(:has(*))')
        merged_text = ' '.join(p.get_text(strip=True) for p in paragraphs)

        if merged_text.strip():
            result = summary_pipeline(
                merged_text, max_length=50, min_length=20, do_sample=False
            )
            summary = result[0]['summary_text']
        else:
            # Sentinel recognized by analyze_sentiment() as "nothing to score";
            # the original crashed the summarizer on empty input and never
            # actually produced this sentinel.
            summary = 'No Summary Available'

        keywords = kw_model.extract_keywords(
            summary, keyphrase_ngram_range=(1, 2), stop_words='english'
        )
        articles.append({
            "Title": item.get('title', ''),  # .get: missing attr is not fatal
            "Summary": summary,
            "URL": link['href'],
            "Topics": [kw for kw, _score in keywords],
        })
    return articles


def analyze_sentiment(text: str) -> Dict:
    """Classify the sentiment of *text*.

    Returns a dict with 'label' and 'score'. Empty text or the
    'No Summary Available' sentinel yields a NEUTRAL fallback; note the
    SST-2 model itself only ever emits POSITIVE or NEGATIVE.
    """
    if not text or text == 'No Summary Available':
        return {"label": "NEUTRAL", "score": 0.5}
    return sentiment_pipeline(text)[0]


def perform_comparative_analysis(articles):
    """Aggregate sentiment counts, pairwise topic differences and topic overlap.

    Mutates each article dict in place by adding a 'Sentiment' key.
    Returns (sentiment_counts, coverage_differences, topic_overlap).
    """
    sentiment_counts = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
    # Guard: the original raised IndexError on an empty article list.
    if not articles:
        return sentiment_counts, [], {"Common Topics": [], "Unique Topics": []}

    unique_topics = [set(article['Topics']) for article in articles]
    # Topics present in every article. The original computed this with a
    # redundant in-loop intersection; a single set.intersection over all
    # topic sets yields the identical result.
    common_topics = set.intersection(*unique_topics)

    coverage_differences = []
    for i in range(len(articles) - 1):
        for j in range(i + 1, len(articles)):
            diff = unique_topics[i] ^ unique_topics[j]  # symmetric difference
            coverage_differences.append({
                "Comparison": f"{articles[i]['Title']} vs {articles[j]['Title']}",
                "Unique Topics": list(diff),
            })

    topic_overlap = {
        "Common Topics": list(common_topics),
        "Unique Topics": [list(topics - common_topics) for topics in unique_topics],
    }

    for article in articles:
        label = analyze_sentiment(article['Summary'])['label']
        article['Sentiment'] = label
        sentiment_counts[label] += 1

    return sentiment_counts, coverage_differences, topic_overlap


def generate_hindi_tts(text: str) -> str:
    """Translate *text* to Hindi and return base64-encoded MP3 audio.

    Returns an empty string when the input is missing/too short (< 50 chars)
    or when translation/synthesis fails — audio is a best-effort feature and
    must never break the caller.
    """
    if not text or len(text.strip()) < 50:
        return ""
    try:
        # The translation API caps input size; stay within a safe bound.
        translated = GoogleTranslator(source='en', target='hi').translate(text[:1000])
        print(f"Translated text: {translated}")
        if not translated:
            return ""
        # TemporaryDirectory guarantees cleanup even on exceptions — the
        # original hand-rolled glob/unlink/rmdir cleanup.
        with tempfile.TemporaryDirectory() as temp_dir:
            # gTTS emits MP3; the original mislabeled the file as .wav.
            audio_path = Path(temp_dir) / "hindi_summary.mp3"
            tts = gTTS(translated, lang='hi')
            tts.save(str(audio_path))  # str(): older gTTS rejects Path objects
            return base64.b64encode(audio_path.read_bytes()).decode('utf-8')
    except Exception as e:
        print(f"Audio Generation Error: {str(e)}")
        return ""