Upload 30 files
Browse files- .gitattributes +20 -0
- .gitignore +50 -0
- Dockerfile +49 -0
- README.md +143 -12
- Spacefile +8 -0
- api.py +332 -0
- app.py +496 -0
- generate_json_output.py +55 -0
- healthcheck.py +171 -0
- requirements.txt +39 -0
- temp_audio_0120e65155dd4f5c8d53a8b7c49336f4.mp3 +3 -0
- temp_audio_0a8a80f832c0405797b7d6475f9e6046.mp3 +3 -0
- temp_audio_0c68dc54f23f4498937d6696f9245651.mp3 +3 -0
- temp_audio_2888808c976c4d658e5e0dfa52370f0c.mp3 +3 -0
- temp_audio_2ac3161769b44d67883f8c68bd68828a.mp3 +3 -0
- temp_audio_2d1949225eb4436e963e7e46db963bf9.mp3 +3 -0
- temp_audio_477f1b5f9c5a49bf956a0c1771ec41d8.mp3 +3 -0
- temp_audio_531e8b49f3fc4d53b3c99c4cbfff71ae.mp3 +3 -0
- temp_audio_55fa301d954b442e9ce58d21be70ed2d.mp3 +3 -0
- temp_audio_59225e2a079d4cfaa0c112206f4e14bb.mp3 +3 -0
- temp_audio_629b84e1f9f44706a7e4082b2b309b07.mp3 +3 -0
- temp_audio_84748717c8134d868dcc7633de098f15.mp3 +3 -0
- temp_audio_8e113f3e7933446291ab390f892b2345.mp3 +3 -0
- temp_audio_aaecc4127abd497d988be047fba22731.mp3 +3 -0
- temp_audio_acd4f5f81a244c0ba4db27bddd0801d5.mp3 +3 -0
- temp_audio_aec8c57313224da590f3447c79a77b89.mp3 +3 -0
- temp_audio_c8c07565663d4c69bd740dc7954a921a.mp3 +3 -0
- temp_audio_d322c8c906234dc98675709f214ed49c.mp3 +3 -0
- temp_audio_f0761eacc6b34a14910b39c7ee84b19d.mp3 +3 -0
- temp_audio_f63cf8928c004b7d8a0cfaffebf44614.mp3 +3 -0
- utils.py +1132 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
temp_audio_0120e65155dd4f5c8d53a8b7c49336f4.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
temp_audio_0a8a80f832c0405797b7d6475f9e6046.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
temp_audio_0c68dc54f23f4498937d6696f9245651.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
temp_audio_2888808c976c4d658e5e0dfa52370f0c.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
temp_audio_2ac3161769b44d67883f8c68bd68828a.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
temp_audio_2d1949225eb4436e963e7e46db963bf9.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
temp_audio_477f1b5f9c5a49bf956a0c1771ec41d8.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
temp_audio_531e8b49f3fc4d53b3c99c4cbfff71ae.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
temp_audio_55fa301d954b442e9ce58d21be70ed2d.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
temp_audio_59225e2a079d4cfaa0c112206f4e14bb.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
temp_audio_629b84e1f9f44706a7e4082b2b309b07.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
temp_audio_84748717c8134d868dcc7633de098f15.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
temp_audio_8e113f3e7933446291ab390f892b2345.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
temp_audio_aaecc4127abd497d988be047fba22731.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
temp_audio_acd4f5f81a244c0ba4db27bddd0801d5.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
temp_audio_aec8c57313224da590f3447c79a77b89.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
temp_audio_c8c07565663d4c69bd740dc7954a921a.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
temp_audio_d322c8c906234dc98675709f214ed49c.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
temp_audio_f0761eacc6b34a14910b39c7ee84b19d.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
temp_audio_f63cf8928c004b7d8a0cfaffebf44614.mp3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
env/
|
| 27 |
+
.env
|
| 28 |
+
|
| 29 |
+
# IDE
|
| 30 |
+
.idea/
|
| 31 |
+
.vscode/
|
| 32 |
+
*.swp
|
| 33 |
+
*.swo
|
| 34 |
+
.DS_Store
|
| 35 |
+
|
| 36 |
+
# Logs
|
| 37 |
+
logs/
|
| 38 |
+
*.log
|
| 39 |
+
|
| 40 |
+
# Audio files
|
| 41 |
+
audio_files/
|
| 42 |
+
*.mp3
|
| 43 |
+
*.wav
|
| 44 |
+
|
| 45 |
+
# Jupyter
|
| 46 |
+
.ipynb_checkpoints
|
| 47 |
+
|
| 48 |
+
# Model caches
|
| 49 |
+
.cache/
|
| 50 |
+
.local/
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install dependencies
|
| 6 |
+
COPY requirements.txt .
|
| 7 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 8 |
+
|
| 9 |
+
# Install additional dependencies needed for NLP tasks and TTS
|
| 10 |
+
RUN apt-get update && apt-get install -y \
|
| 11 |
+
build-essential \
|
| 12 |
+
curl \
|
| 13 |
+
software-properties-common \
|
| 14 |
+
git \
|
| 15 |
+
ffmpeg \
|
| 16 |
+
espeak \
|
| 17 |
+
libespeak-dev \
|
| 18 |
+
alsa-utils \
|
| 19 |
+
python3-pyaudio \
|
| 20 |
+
libasound2-dev \
|
| 21 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Copy app files
|
| 24 |
+
COPY . .
|
| 25 |
+
|
| 26 |
+
# Create directory for audio files
|
| 27 |
+
RUN mkdir -p audio_files
|
| 28 |
+
|
| 29 |
+
# Set environment variables
|
| 30 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 31 |
+
ENV PYTHONUNBUFFERED=1
|
| 32 |
+
|
| 33 |
+
# Download NLTK data
|
| 34 |
+
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
|
| 35 |
+
|
| 36 |
+
# Expose ports
|
| 37 |
+
EXPOSE 8000
|
| 38 |
+
EXPOSE 8501
|
| 39 |
+
|
| 40 |
+
# Create a shell script to run both services
|
| 41 |
+
RUN echo '#!/bin/bash\n\
|
| 42 |
+
uvicorn api:app --host 0.0.0.0 --port 8000 &\n\
|
| 43 |
+
streamlit run app.py --server.port 8501 --server.address 0.0.0.0\n'\
|
| 44 |
+
> /app/start.sh
|
| 45 |
+
|
| 46 |
+
RUN chmod +x /app/start.sh
|
| 47 |
+
|
| 48 |
+
# Start the application
|
| 49 |
+
CMD ["/app/start.sh"]
|
README.md
CHANGED
|
@@ -1,12 +1,143 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# News Summarization and Text-to-Speech Application
|
| 2 |
+
|
| 3 |
+
A web-based application that extracts news articles related to companies, performs sentiment analysis, conducts comparative analysis, and generates a text-to-speech output in Hindi.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **News Extraction**: Scrapes at least 10 unique news articles about a given company using BeautifulSoup
|
| 8 |
+
- **Sentiment Analysis**: Analyzes the sentiment of each article (positive, negative, neutral)
|
| 9 |
+
- **Comparative Analysis**: Compares sentiment across articles to derive insights
|
| 10 |
+
- **Text-to-Speech**: Converts summarized content to Hindi speech
|
| 11 |
+
- **User Interface**: Simple web interface built with Streamlit
|
| 12 |
+
- **API Communication**: Backend and frontend communicate through APIs
|
| 13 |
+
|
| 14 |
+
## Project Structure
|
| 15 |
+
|
| 16 |
+
```
|
| 17 |
+
.
|
| 18 |
+
├── app.py # Main Streamlit application
|
| 19 |
+
├── api.py # API endpoints
|
| 20 |
+
├── utils.py # Utility functions for scraping, sentiment analysis, etc.
|
| 21 |
+
├── healthcheck.py # Script to verify all dependencies and services
|
| 22 |
+
├── requirements.txt # Project dependencies
|
| 23 |
+
├── Dockerfile # Docker configuration for deployment
|
| 24 |
+
├── Spacefile # Hugging Face Spaces configuration
|
| 25 |
+
└── README.md # Project documentation
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Setup Instructions
|
| 29 |
+
|
| 30 |
+
1. **Clone the repository**:
|
| 31 |
+
```
|
| 32 |
+
git clone https://github.com/yourusername/news-summarization-tts.git
|
| 33 |
+
cd news-summarization-tts
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
2. **Create a virtual environment** (recommended):
|
| 37 |
+
```
|
| 38 |
+
python -m venv venv
|
| 39 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
3. **Install dependencies**:
|
| 43 |
+
```
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
4. **Install system dependencies** (for text-to-speech functionality):
|
| 48 |
+
- On Ubuntu/Debian:
|
| 49 |
+
```
|
| 50 |
+
sudo apt-get install espeak ffmpeg
|
| 51 |
+
```
|
| 52 |
+
- On Windows:
|
| 53 |
+
Download and install espeak from http://espeak.sourceforge.net/download.html
|
| 54 |
+
|
| 55 |
+
5. **Run the healthcheck** (to verify all dependencies are working):
|
| 56 |
+
```
|
| 57 |
+
python healthcheck.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
6. **Run the API server**:
|
| 61 |
+
```
|
| 62 |
+
uvicorn api:app --reload
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
7. **Run the Streamlit application** (in a separate terminal):
|
| 66 |
+
```
|
| 67 |
+
streamlit run app.py
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Models Used
|
| 71 |
+
|
| 72 |
+
- **News Summarization**: Extractive summarization using NLTK and NetworkX
|
| 73 |
+
- **Sentiment Analysis**: VADER for sentiment analysis and Hugging Face Transformers
|
| 74 |
+
- **Translation**: Google Translate API via deep-translator library
|
| 75 |
+
- **Text-to-Speech**: Google Text-to-Speech (gTTS) and pyttsx3 as fallback for Hindi conversion
|
| 76 |
+
|
| 77 |
+
## API Documentation
|
| 78 |
+
|
| 79 |
+
### Endpoints
|
| 80 |
+
|
| 81 |
+
- `POST /api/get_news`: Fetches news articles about a company
|
| 82 |
+
- Request body: `{"company_name": "Tesla"}`
|
| 83 |
+
- Returns a list of articles with metadata
|
| 84 |
+
|
| 85 |
+
- `POST /api/analyze_sentiment`: Performs sentiment analysis on articles
|
| 86 |
+
- Request body: `{"articles": [article_list]}`
|
| 87 |
+
- Returns sentiment analysis for each article
|
| 88 |
+
|
| 89 |
+
- `POST /api/generate_speech`: Converts text to Hindi speech
|
| 90 |
+
- Request body: `{"text": "summarized_text"}`
|
| 91 |
+
- Returns a URL to the generated audio file
|
| 92 |
+
|
| 93 |
+
- `POST /api/complete_analysis`: Performs complete analysis including fetching news, sentiment analysis, and generating speech
|
| 94 |
+
- Request body: `{"company_name": "Tesla"}`
|
| 95 |
+
- Returns complete analysis results
|
| 96 |
+
|
| 97 |
+
## Assumptions & Limitations
|
| 98 |
+
|
| 99 |
+
- The application scrapes publicly available news articles that don't require JavaScript rendering
|
| 100 |
+
- Sentiment analysis accuracy depends on the model used and may not capture context-specific nuances
|
| 101 |
+
- Hindi translation and TTS quality may vary based on technical terms
|
| 102 |
+
- The application requires an internet connection to fetch news articles and use cloud-based services
|
| 103 |
+
|
| 104 |
+
## Troubleshooting
|
| 105 |
+
|
| 106 |
+
If you encounter any issues:
|
| 107 |
+
|
| 108 |
+
1. Run the healthcheck script to verify all dependencies are working:
|
| 109 |
+
```
|
| 110 |
+
python healthcheck.py
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
2. Check that you have all the required system dependencies installed (espeak, ffmpeg).
|
| 114 |
+
|
| 115 |
+
3. If you encounter issues with specific components:
|
| 116 |
+
- Translation service requires an internet connection
|
| 117 |
+
- Text-to-speech uses gTTS by default, but falls back to pyttsx3 if needed
|
| 118 |
+
- Transformer models may take time to download on first run
|
| 119 |
+
|
| 120 |
+
## Deployment
|
| 121 |
+
|
| 122 |
+
This application is deployed on Hugging Face Spaces: [Link to deployment]
|
| 123 |
+
|
| 124 |
+
### Using Docker
|
| 125 |
+
|
| 126 |
+
You can also run the application using Docker:
|
| 127 |
+
|
| 128 |
+
```
|
| 129 |
+
docker build -t news-summarization-tts .
|
| 130 |
+
docker run -p 8501:8501 -p 8000:8000 news-summarization-tts
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
## Future Improvements
|
| 134 |
+
|
| 135 |
+
- Add support for more languages
|
| 136 |
+
- Implement advanced NLP techniques for better summarization
|
| 137 |
+
- Improve the user interface with more interactive visualizations
|
| 138 |
+
- Add historical data analysis for tracking sentiment over time
|
| 139 |
+
- Enhance TTS quality with dedicated Hindi speech models
|
| 140 |
+
|
| 141 |
+
## License
|
| 142 |
+
|
| 143 |
+
MIT
|
Spacefile
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
|
| 2 |
+
title: News Summarization and TTS
|
| 3 |
+
emoji: 📰
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
pinned: false
|
api.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Response, File, UploadFile, Form
|
| 2 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import uuid
|
| 9 |
+
import asyncio
|
| 10 |
+
import uvicorn
|
| 11 |
+
from utils import (search_news, analyze_article_sentiment, perform_comparative_analysis,
|
| 12 |
+
translate_to_hindi, text_to_speech, prepare_final_report, NewsArticle)
|
| 13 |
+
|
| 14 |
+
# Initialize FastAPI app
|
| 15 |
+
app = FastAPI(
|
| 16 |
+
title="News Summarization and TTS API",
|
| 17 |
+
description="API for extracting news, performing sentiment analysis, and generating Hindi TTS audio",
|
| 18 |
+
version="1.0.0"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Add CORS middleware
|
| 22 |
+
app.add_middleware(
|
| 23 |
+
CORSMiddleware,
|
| 24 |
+
allow_origins=["*"], # Allow all origins
|
| 25 |
+
allow_credentials=True,
|
| 26 |
+
allow_methods=["*"], # Allow all methods
|
| 27 |
+
allow_headers=["*"], # Allow all headers
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Define request/response models
|
| 31 |
+
class CompanyRequest(BaseModel):
|
| 32 |
+
company_name: str
|
| 33 |
+
|
| 34 |
+
class TextToSpeechRequest(BaseModel):
|
| 35 |
+
text: str
|
| 36 |
+
output_filename: Optional[str] = None
|
| 37 |
+
|
| 38 |
+
class SentimentAnalysisRequest(BaseModel):
|
| 39 |
+
articles: List[Dict[str, Any]]
|
| 40 |
+
|
| 41 |
+
class NewsResponse(BaseModel):
|
| 42 |
+
articles: List[Dict[str, Any]]
|
| 43 |
+
|
| 44 |
+
class SentimentResponse(BaseModel):
|
| 45 |
+
sentiment_analysis: Dict[str, Any]
|
| 46 |
+
|
| 47 |
+
class TextToSpeechResponse(BaseModel):
|
| 48 |
+
audio_file: str
|
| 49 |
+
text: str
|
| 50 |
+
|
| 51 |
+
# Create a directory for audio files if it doesn't exist
|
| 52 |
+
os.makedirs("audio_files", exist_ok=True)
|
| 53 |
+
|
| 54 |
+
# API endpoints
|
| 55 |
+
@app.get("/")
|
| 56 |
+
async def root():
|
| 57 |
+
"""Root endpoint to check if API is running."""
|
| 58 |
+
return {"message": "News Summarization and TTS API is running"}
|
| 59 |
+
|
| 60 |
+
@app.post("/api/get_news", response_model=NewsResponse)
|
| 61 |
+
async def get_news(request: CompanyRequest):
|
| 62 |
+
"""Fetch news articles about a specific company."""
|
| 63 |
+
try:
|
| 64 |
+
company_name = request.company_name
|
| 65 |
+
articles = search_news(company_name)
|
| 66 |
+
|
| 67 |
+
if not articles:
|
| 68 |
+
raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
|
| 69 |
+
|
| 70 |
+
# Convert NewsArticle objects to dictionaries
|
| 71 |
+
article_data = [article.to_dict() for article in articles]
|
| 72 |
+
|
| 73 |
+
return {"articles": article_data}
|
| 74 |
+
|
| 75 |
+
except Exception as e:
|
| 76 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 77 |
+
|
| 78 |
+
@app.post("/api/analyze_sentiment", response_model=SentimentResponse)
|
| 79 |
+
async def analyze_sentiment(request: SentimentAnalysisRequest):
|
| 80 |
+
"""Analyze sentiment of provided articles."""
|
| 81 |
+
try:
|
| 82 |
+
# Convert dictionaries back to NewsArticle objects
|
| 83 |
+
articles = []
|
| 84 |
+
for article_dict in request.articles:
|
| 85 |
+
article = NewsArticle(
|
| 86 |
+
title=article_dict["title"],
|
| 87 |
+
url=article_dict["url"],
|
| 88 |
+
content=article_dict["content"],
|
| 89 |
+
summary=article_dict.get("summary", ""),
|
| 90 |
+
source=article_dict.get("source", ""),
|
| 91 |
+
date=article_dict.get("date", ""),
|
| 92 |
+
sentiment=article_dict.get("sentiment", ""),
|
| 93 |
+
topics=article_dict.get("topics", [])
|
| 94 |
+
)
|
| 95 |
+
articles.append(article)
|
| 96 |
+
|
| 97 |
+
# Perform detailed sentiment analysis for each article
|
| 98 |
+
detailed_sentiment = [analyze_article_sentiment(article) for article in articles]
|
| 99 |
+
|
| 100 |
+
# Perform comparative analysis
|
| 101 |
+
comparative_analysis = perform_comparative_analysis(articles)
|
| 102 |
+
|
| 103 |
+
return {
|
| 104 |
+
"sentiment_analysis": {
|
| 105 |
+
"detailed_sentiment": detailed_sentiment,
|
| 106 |
+
"comparative_analysis": comparative_analysis
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
except Exception as e:
|
| 111 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 112 |
+
|
| 113 |
+
@app.post("/api/generate_speech", response_model=TextToSpeechResponse)
|
| 114 |
+
async def generate_speech(request: TextToSpeechRequest):
|
| 115 |
+
"""Convert text to Hindi speech."""
|
| 116 |
+
try:
|
| 117 |
+
text = request.text
|
| 118 |
+
|
| 119 |
+
# Generate a unique filename if not provided
|
| 120 |
+
output_filename = request.output_filename
|
| 121 |
+
if not output_filename:
|
| 122 |
+
unique_id = uuid.uuid4().hex
|
| 123 |
+
output_filename = f"audio_files/{unique_id}.mp3"
|
| 124 |
+
elif not output_filename.startswith("audio_files/"):
|
| 125 |
+
output_filename = f"audio_files/{output_filename}"
|
| 126 |
+
|
| 127 |
+
# Translate text to Hindi
|
| 128 |
+
hindi_text = translate_to_hindi(text)
|
| 129 |
+
|
| 130 |
+
# Convert text to speech
|
| 131 |
+
audio_file = text_to_speech(hindi_text, output_filename)
|
| 132 |
+
|
| 133 |
+
if not audio_file:
|
| 134 |
+
raise HTTPException(status_code=500, detail="Failed to generate audio file")
|
| 135 |
+
|
| 136 |
+
return {
|
| 137 |
+
"audio_file": audio_file,
|
| 138 |
+
"text": hindi_text
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
except Exception as e:
|
| 142 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 143 |
+
|
| 144 |
+
@app.post("/api/complete_analysis")
|
| 145 |
+
async def complete_analysis(request: CompanyRequest):
|
| 146 |
+
"""Perform complete analysis for a company."""
|
| 147 |
+
try:
|
| 148 |
+
company_name = request.company_name
|
| 149 |
+
|
| 150 |
+
# Log the start of analysis
|
| 151 |
+
print(f"Starting complete analysis for company: {company_name}")
|
| 152 |
+
|
| 153 |
+
# Step 1: Get news articles
|
| 154 |
+
print("Step 1: Fetching news articles...")
|
| 155 |
+
articles = search_news(company_name, num_articles=5) # Increased from default 3 to 5
|
| 156 |
+
print(f"Found {len(articles)} articles for {company_name}")
|
| 157 |
+
|
| 158 |
+
if not articles:
|
| 159 |
+
raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
|
| 160 |
+
|
| 161 |
+
# Step 2: Perform comparative analysis
|
| 162 |
+
print("Step 2: Performing comparative analysis...")
|
| 163 |
+
comparative_analysis = perform_comparative_analysis(articles)
|
| 164 |
+
print("Comparative analysis completed")
|
| 165 |
+
|
| 166 |
+
# Step 3: Prepare final report
|
| 167 |
+
print("Step 3: Preparing final report...")
|
| 168 |
+
final_report = prepare_final_report(company_name, articles, comparative_analysis)
|
| 169 |
+
print("Final report prepared")
|
| 170 |
+
|
| 171 |
+
# Step 4: Generate Hindi TTS
|
| 172 |
+
print("Step 4: Generating Hindi TTS...")
|
| 173 |
+
unique_id = uuid.uuid4().hex
|
| 174 |
+
output_filename = f"audio_files/{unique_id}.mp3"
|
| 175 |
+
|
| 176 |
+
# Use the Hindi summary for TTS
|
| 177 |
+
hindi_text = final_report["Hindi Summary"]
|
| 178 |
+
print(f"Converting Hindi text to speech (length: {len(hindi_text)} characters)")
|
| 179 |
+
|
| 180 |
+
audio_file = text_to_speech(hindi_text, output_filename)
|
| 181 |
+
|
| 182 |
+
# Format the response to match the example output exactly
|
| 183 |
+
formatted_response = {
|
| 184 |
+
"Company": company_name,
|
| 185 |
+
"Articles": final_report["Articles"],
|
| 186 |
+
"Comparative Sentiment Score": {
|
| 187 |
+
"Sentiment Distribution": comparative_analysis["Sentiment Distribution"],
|
| 188 |
+
"Coverage Differences": comparative_analysis["Coverage Differences"],
|
| 189 |
+
"Topic Overlap": {
|
| 190 |
+
"Common Topics": comparative_analysis["Topic Overlap"]["Common Topics Across All"],
|
| 191 |
+
}
|
| 192 |
+
},
|
| 193 |
+
"Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
# Format the unique topics by article to match the expected output exactly
|
| 197 |
+
unique_topics = comparative_analysis["Topic Overlap"]["Unique Topics By Article"]
|
| 198 |
+
for article_idx, topics in unique_topics.items():
|
| 199 |
+
article_num = int(article_idx) + 1
|
| 200 |
+
formatted_response["Comparative Sentiment Score"]["Topic Overlap"][f"Unique Topics in Article {article_num}"] = topics
|
| 201 |
+
|
| 202 |
+
# If we don't have more than 1 article, create some example comparisons to match format
|
| 203 |
+
if len(articles) <= 1:
|
| 204 |
+
formatted_response["Comparative Sentiment Score"]["Coverage Differences"] = [
|
| 205 |
+
{
|
| 206 |
+
"Comparison": f"Only one article about {company_name} was found, limiting comparative analysis.",
|
| 207 |
+
"Impact": "Unable to compare coverage across multiple sources for more comprehensive insights."
|
| 208 |
+
}
|
| 209 |
+
]
|
| 210 |
+
|
| 211 |
+
# Add audio information
|
| 212 |
+
if not audio_file:
|
| 213 |
+
print("Warning: Failed to generate audio file")
|
| 214 |
+
formatted_response["Audio"] = "Failed to generate audio"
|
| 215 |
+
else:
|
| 216 |
+
print(f"Audio file generated: {audio_file}")
|
| 217 |
+
formatted_response["Audio"] = f"[Play Hindi Speech]"
|
| 218 |
+
# Store the actual audio file path in a hidden field
|
| 219 |
+
formatted_response["_audio_file_path"] = audio_file
|
| 220 |
+
|
| 221 |
+
# Add the Hindi Summary to the response as well (needed for rendering in Streamlit)
|
| 222 |
+
formatted_response["Hindi Summary"] = final_report["Hindi Summary"]
|
| 223 |
+
|
| 224 |
+
print("Complete analysis finished successfully")
|
| 225 |
+
return formatted_response
|
| 226 |
+
|
| 227 |
+
except HTTPException as he:
|
| 228 |
+
# Re-raise HTTP exceptions
|
| 229 |
+
print(f"HTTP Exception: {he.detail}")
|
| 230 |
+
raise
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
# For any other exception, provide detailed error information
|
| 234 |
+
import traceback
|
| 235 |
+
error_trace = traceback.format_exc()
|
| 236 |
+
error_message = f"Error processing request: {str(e)}"
|
| 237 |
+
print(f"ERROR: {error_message}")
|
| 238 |
+
print(f"Traceback: {error_trace}")
|
| 239 |
+
|
| 240 |
+
# Return a more user-friendly error message
|
| 241 |
+
user_message = "An error occurred during analysis. "
|
| 242 |
+
|
| 243 |
+
if "timeout" in str(e).lower():
|
| 244 |
+
user_message += "There was a timeout when connecting to news sources. Please try again or try another company name."
|
| 245 |
+
elif "connection" in str(e).lower():
|
| 246 |
+
user_message += "There was a connection issue with one of the news sources. Please check your internet connection."
|
| 247 |
+
elif "not found" in str(e).lower():
|
| 248 |
+
user_message += f"No information could be found for {company_name}. Please try another company name."
|
| 249 |
+
else:
|
| 250 |
+
user_message += "Please try again with a different company name or check the server logs for more details."
|
| 251 |
+
|
| 252 |
+
raise HTTPException(status_code=500, detail=user_message)
|
| 253 |
+
|
| 254 |
+
@app.get("/api/audio/{file_name}")
|
| 255 |
+
async def get_audio(file_name: str):
|
| 256 |
+
"""Serve audio files."""
|
| 257 |
+
file_path = f"audio_files/{file_name}"
|
| 258 |
+
|
| 259 |
+
# Make sure the audio_files directory exists
|
| 260 |
+
os.makedirs("audio_files", exist_ok=True)
|
| 261 |
+
|
| 262 |
+
if not os.path.exists(file_path):
|
| 263 |
+
print(f"Audio file not found: {file_path}")
|
| 264 |
+
# Check if any audio files exist in the directory
|
| 265 |
+
audio_files = os.listdir("audio_files") if os.path.exists("audio_files") else []
|
| 266 |
+
print(f"Available audio files: {audio_files}")
|
| 267 |
+
raise HTTPException(status_code=404, detail=f"Audio file {file_name} not found")
|
| 268 |
+
|
| 269 |
+
try:
|
| 270 |
+
# Verify the file can be opened and is not corrupt
|
| 271 |
+
with open(file_path, "rb") as f:
|
| 272 |
+
file_size = os.path.getsize(file_path)
|
| 273 |
+
print(f"Serving audio file: {file_path} (size: {file_size} bytes)")
|
| 274 |
+
if file_size == 0:
|
| 275 |
+
raise HTTPException(status_code=500, detail="Audio file is empty")
|
| 276 |
+
except Exception as e:
|
| 277 |
+
print(f"Error accessing audio file {file_path}: {str(e)}")
|
| 278 |
+
raise HTTPException(status_code=500, detail=f"Error accessing audio file: {str(e)}")
|
| 279 |
+
|
| 280 |
+
# Set appropriate headers for audio file
|
| 281 |
+
headers = {
|
| 282 |
+
"Cache-Control": "no-cache, no-store, must-revalidate",
|
| 283 |
+
"Pragma": "no-cache",
|
| 284 |
+
"Expires": "0",
|
| 285 |
+
"Content-Disposition": f"attachment; filename={file_name}"
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
# Determine the correct media type based on file extension
|
| 289 |
+
media_type = "audio/mpeg"
|
| 290 |
+
if file_name.lower().endswith(".wav"):
|
| 291 |
+
media_type = "audio/wav"
|
| 292 |
+
|
| 293 |
+
return FileResponse(
|
| 294 |
+
path=file_path,
|
| 295 |
+
media_type=media_type,
|
| 296 |
+
headers=headers,
|
| 297 |
+
filename=file_name
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
@app.post("/api/example_format")
|
| 301 |
+
async def get_example_format(request: CompanyRequest):
|
| 302 |
+
"""
|
| 303 |
+
Get analysis results in the example format specified.
|
| 304 |
+
This endpoint provides results that exactly match the requested output format.
|
| 305 |
+
"""
|
| 306 |
+
try:
|
| 307 |
+
# Get the base analysis
|
| 308 |
+
company_name = request.company_name
|
| 309 |
+
result = await complete_analysis(request)
|
| 310 |
+
|
| 311 |
+
# Format it to match the example output
|
| 312 |
+
formatted_output = {
|
| 313 |
+
"Company": result["Company"],
|
| 314 |
+
"Articles": result["Articles"],
|
| 315 |
+
"Comparative Sentiment Score": {
|
| 316 |
+
"Sentiment Distribution": result["Comparative Sentiment Score"]["Sentiment Distribution"],
|
| 317 |
+
"Coverage Differences": result["Comparative Sentiment Score"]["Coverage Differences"],
|
| 318 |
+
"Topic Overlap": result["Comparative Sentiment Score"]["Topic Overlap"]
|
| 319 |
+
},
|
| 320 |
+
"Final Sentiment Analysis": result["Final Sentiment Analysis"],
|
| 321 |
+
"Audio": "[Play Hindi Speech]" if result.get("Audio") else "No audio available"
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
return formatted_output
|
| 325 |
+
|
| 326 |
+
except HTTPException:
|
| 327 |
+
raise
|
| 328 |
+
except Exception as e:
|
| 329 |
+
raise HTTPException(status_code=500, detail=f"Error generating example format: {str(e)}")
|
| 330 |
+
|
| 331 |
+
if __name__ == "__main__":
|
| 332 |
+
uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
|
app.py
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
import base64
|
| 9 |
+
from io import BytesIO
|
| 10 |
+
from PIL import Image, ImageEnhance
|
| 11 |
+
import time
|
| 12 |
+
from typing import Dict, Any, List
|
| 13 |
+
|
| 14 |
+
# API Base URL - Change this to match your deployment
|
| 15 |
+
API_BASE_URL = "http://localhost:8000"
|
| 16 |
+
|
| 17 |
+
# New function to generate the example output format
|
| 18 |
+
def generate_example_output(company_name: str) -> str:
    """
    Fetch the complete analysis for *company_name* from the backend and
    render it as a JSON string in the documented example format.

    On any failure (network error, HTTP error, missing keys) an error
    payload with "error" and "message" keys is returned instead.
    """
    try:
        # Pull the full analysis payload from the API.
        resp = requests.post(
            f"{API_BASE_URL}/api/complete_analysis",
            json={"company_name": company_name},
        )
        resp.raise_for_status()
        data = resp.json()

        comparative = data["Comparative Sentiment Score"]
        audio_label = "[Play Hindi Speech]" if data.get("Audio") else "No audio available"

        # Reshape into the exact example schema before serializing.
        example = {
            "Company": data["Company"],
            "Articles": data["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": comparative["Sentiment Distribution"],
                "Coverage Differences": comparative["Coverage Differences"],
                "Topic Overlap": comparative["Topic Overlap"],
            },
            "Final Sentiment Analysis": data["Final Sentiment Analysis"],
            "Audio": audio_label,
        }

        return json.dumps(example, indent=2)

    except Exception as e:
        # Surface the failure as JSON so callers can display it verbatim.
        return json.dumps(
            {"error": str(e), "message": "Failed to generate example output"},
            indent=2,
        )
|
| 51 |
+
|
| 52 |
+
# Function to run in terminal mode
|
| 53 |
+
def run_terminal_mode():
    """Interactive CLI mode: prompt for a company name and print the JSON result."""
    print("News Analysis Terminal Mode")
    name = input("Enter company name: ")
    print(f"Analyzing {name}...")
    print(generate_example_output(name))
|
| 60 |
+
|
| 61 |
+
# Check if run directly or imported
|
| 62 |
+
if __name__ == "__main__":
    # Headless mode (`python app.py --terminal`) prints JSON to stdout;
    # any other invocation falls through to the Streamlit UI below.
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "--terminal":
        run_terminal_mode()
    else:
        # Continue with the Streamlit app: configure the page first,
        # since st.set_page_config must be the first Streamlit call.
        st.set_page_config(
            page_title="News Summarization & TTS",
            page_icon="📰",
            layout="wide",
            initial_sidebar_state="expanded",
        )
|
| 77 |
+
|
| 78 |
+
# Custom CSS for better UI, hoisted to a named constant so the layout
# code stays readable. NOTE(review): original in-string indentation was
# lost in transit; reconstructed with a uniform 4-space indent.
_CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: 700;
        color: #1E3A8A;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.5rem;
        font-weight: 600;
        color: #2563EB;
        margin-top: 1rem;
        margin-bottom: 0.5rem;
    }
    .card {
        padding: 1.5rem;
        border-radius: 0.5rem;
        background-color: #F8FAFC;
        border: 1px solid #E2E8F0;
        margin-bottom: 1rem;
    }
    .positive {
        color: #059669;
        font-weight: 600;
    }
    .negative {
        color: #DC2626;
        font-weight: 600;
    }
    .neutral {
        color: #6B7280;
        font-weight: 600;
    }
    .topic-tag {
        display: inline-block;
        padding: 0.25rem 0.5rem;
        border-radius: 2rem;
        background-color: #E5E7EB;
        color: #1F2937;
        font-size: 0.75rem;
        margin-right: 0.5rem;
        margin-bottom: 0.5rem;
    }
    .audio-container {
        width: 100%;
        padding: 1rem;
        background-color: #F3F4F6;
        border-radius: 0.5rem;
        margin-top: 1rem;
    }
    .info-text {
        font-size: 0.9rem;
        color: #4B5563;
    }
    .article-title {
        font-size: 1.2rem;
        font-weight: 600;
        color: #111827;
        margin-bottom: 0.5rem;
        margin-top: 0.5rem;
    }
    .article-summary {
        font-size: 0.9rem;
        color: #374151;
        margin-bottom: 0.5rem;
    }
    .article-meta {
        font-size: 0.8rem;
        color: #6B7280;
        margin-bottom: 0.5rem;
    }
    .section-divider {
        height: 1px;
        background-color: #E5E7EB;
        margin: 1.5rem 0;
    }
    .chart-container {
        background-color: white;
        padding: 1rem;
        border-radius: 0.5rem;
        border: 1px solid #E2E8F0;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
| 164 |
+
|
| 165 |
+
# Function to make API requests
|
| 166 |
+
def make_api_request(endpoint: str, data: Dict[str, Any] = None, method: str = "POST") -> Dict[str, Any]:
    """Make API request to the backend.

    Args:
        endpoint: Path appended to API_BASE_URL (e.g. "/api/complete_analysis").
        data: JSON payload for POST requests; ignored for GET.
        method: "GET" for a GET request; any other value issues a POST.

    Returns:
        The parsed JSON response, or {} on any error (the error is shown
        in the Streamlit UI as a side effect).
    """
    url = f"{API_BASE_URL}{endpoint}"

    try:
        # Fix: a finite timeout makes the Timeout branch below reachable;
        # without one, requests waits indefinitely and that handler was dead code.
        if method == "GET":
            response = requests.get(url, timeout=120)
        else:
            response = requests.post(url, json=data, timeout=120)

        response.raise_for_status()
        return response.json()
    except requests.exceptions.ConnectionError:
        st.error("⚠️ Connection Error: Cannot connect to the API server. Please ensure the API server is running at " + API_BASE_URL)
        return {}
    except requests.exceptions.Timeout:
        st.error("⚠️ Timeout Error: The request took too long to complete. Please try again with a different company name.")
        return {}
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            st.error("⚠️ No articles found for this company. Please try another company name.")
        elif e.response.status_code == 500:
            # Surface the server's own error detail when it returned JSON.
            try:
                error_detail = e.response.json().get("detail", "Unknown server error")
                st.error(f"⚠️ Server Error: {error_detail}")
            except Exception:  # fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit
                st.error("⚠️ Internal Server Error: Something went wrong on the server. Please try again later.")
        else:
            st.error(f"⚠️ HTTP Error: {str(e)}")
        return {}
    except Exception as e:
        st.error(f"⚠️ Error: {str(e)}")
        return {}
|
| 200 |
+
|
| 201 |
+
# Function to create sentiment color
|
| 202 |
+
def get_sentiment_color(sentiment: str) -> str:
    """Map a sentiment label to its CSS class ("positive", "negative", else "neutral")."""
    css_class_by_label = {"Positive": "positive", "Negative": "negative"}
    return css_class_by_label.get(sentiment, "neutral")
|
| 210 |
+
|
| 211 |
+
# Function to create visualization for sentiment distribution
|
| 212 |
+
def plot_sentiment_distribution(sentiment_data: Dict[str, int]):
    """Build a bar chart of article counts per sentiment class.

    Args:
        sentiment_data: Mapping with "Positive", "Neutral" and "Negative" keys.

    Returns:
        The matplotlib Figure; the caller hands it to st.pyplot for display.
    """
    categories = ["Positive", "Neutral", "Negative"]
    counts = [sentiment_data[category] for category in categories]
    bar_colors = ["#059669", "#6B7280", "#DC2626"]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(categories, counts, color=bar_colors)
    ax.set_title("Sentiment Distribution", fontsize=16, fontweight='bold')
    ax.set_ylabel("Number of Articles", fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate each bar with its count, nudged just above the bar top.
    for position, count in enumerate(counts):
        ax.text(position, count + 0.1, str(count), ha='center', fontweight='bold')

    return fig
|
| 229 |
+
|
| 230 |
+
# Function to display article information
|
| 231 |
+
def display_article(article: Dict[str, Any], index: int):
    """Render one article as a styled card: title, sentiment badge, summary, topic tags."""
    st.markdown(f"<div class='card'>", unsafe_allow_html=True)

    # Title (1-based numbering) plus a color-coded sentiment label.
    sentiment = article.get("Sentiment", "Neutral")
    badge_class = get_sentiment_color(sentiment)

    st.markdown(f"<h3 class='article-title'>{index+1}. {article['Title']}</h3>", unsafe_allow_html=True)
    st.markdown(f"<span class='{badge_class}'>{sentiment}</span>", unsafe_allow_html=True)

    # Summary body (falls back to a placeholder when absent).
    st.markdown("<div class='article-summary'>", unsafe_allow_html=True)
    st.markdown(f"{article.get('Summary', 'No summary available.')}", unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)

    # Topic tags, only when the article carries a non-empty topic list.
    if article.get("Topics"):
        st.markdown("<div>", unsafe_allow_html=True)
        for topic in article["Topics"]:
            st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
        st.markdown("</div>", unsafe_allow_html=True)

    st.markdown("</div>", unsafe_allow_html=True)
|
| 255 |
+
|
| 256 |
+
# ---- Page header -----------------------------------------------------------
st.markdown("<h1 class='main-header'>📰 News Summarization & Text-to-Speech</h1>", unsafe_allow_html=True)
st.markdown("""
<p class='info-text'>
This application extracts news articles about a company, performs sentiment analysis, conducts comparative analysis,
and generates a text-to-speech output in Hindi. Enter a company name to get started.
</p>
""", unsafe_allow_html=True)

# ---- Sidebar: logo and controls -------------------------------------------
st.sidebar.image("https://cdn-icons-png.flaticon.com/512/2593/2593073.png", width=100)
st.sidebar.title("News Analysis Settings")

# Company selection: free text or a fixed pick-list.
company_input_method = st.sidebar.radio(
    "Select company input method:",
    options=["Text Input", "Choose from List"],
)

if company_input_method == "Text Input":
    company_name = st.sidebar.text_input("Enter Company Name:", placeholder="e.g., Tesla")
else:
    companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla", "Meta", "Netflix", "Uber", "Airbnb", "Twitter"]
    company_name = st.sidebar.selectbox("Select Company:", companies)

# Analysis settings.
max_articles = st.sidebar.slider("Maximum Articles to Analyze:", min_value=5, max_value=20, value=10)
st.sidebar.markdown("---")

# Trigger button for the whole pipeline.
analyze_button = st.sidebar.button("Analyze Company News", type="primary")

# Audio playback settings.
st.sidebar.markdown("## Audio Settings")
audio_speed = st.sidebar.select_slider("TTS Speech Speed:", options=["Slow", "Normal", "Fast"], value="Normal")
st.sidebar.markdown("---")

# Developer option: show the raw JSON in the example format.
st.sidebar.markdown("## Developer Options")
show_json = st.sidebar.checkbox("Show JSON output in example format")
st.sidebar.markdown("---")

# About section.
with st.sidebar.expander("About This App"):
    st.markdown("""
    This application performs:
    - News extraction from multiple sources
    - Sentiment analysis of the content
    - Topic identification and comparative analysis
    - Text-to-speech conversion to Hindi

    Built with Streamlit, FastAPI, and various NLP tools.
    """)
|
| 309 |
+
|
| 310 |
+
# ---- Main content area -----------------------------------------------------
if analyze_button and company_name:
    with st.spinner(f"Analyzing news for {company_name}... This may take a minute"):
        # Run the complete backend pipeline for the selected company.
        response = make_api_request(
            "/api/complete_analysis",
            {"company_name": company_name},
        )

        if not response:
            st.error("Failed to retrieve data. Please try again.")
        elif "detail" in response:
            st.error(response["detail"])
        else:
            # Company header.
            st.markdown(f"<h2 class='sub-header'>Analysis Results for {response['Company']}</h2>", unsafe_allow_html=True)

            # Sentiment summary: text on the left, distribution chart on the right.
            col1, col2 = st.columns([2, 1])

            with col1:
                st.markdown("<div class='card'>", unsafe_allow_html=True)
                st.markdown("<h3 class='sub-header'>Sentiment Overview</h3>", unsafe_allow_html=True)
                st.markdown(f"{response['Final Sentiment Analysis']}")
                st.markdown("</div>", unsafe_allow_html=True)

            with col2:
                sentiment_data = response["Comparative Sentiment Score"]["Sentiment Distribution"]
                fig = plot_sentiment_distribution(sentiment_data)
                st.pyplot(fig)

            st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

            # Hindi TTS audio section.
            if "Audio" in response and response["Audio"]:
                st.markdown("<h3 class='sub-header'>Hindi Audio Summary</h3>", unsafe_allow_html=True)

                audio_message = response["Audio"]

                if audio_message == "Failed to generate audio":
                    st.warning("Hindi audio could not be generated. However, you can still read the Hindi text below.")
                else:
                    try:
                        # The backend may include the actual audio file path.
                        audio_file_path = response.get("_audio_file_path")

                        if audio_file_path:
                            audio_filename = os.path.basename(audio_file_path)
                            audio_url = f"{API_BASE_URL}/api/audio/{audio_filename}"
                        else:
                            st.info("Audio is available but the path was not provided.")
                            audio_url = None

                        if audio_url:
                            # Download, cache locally, then play from disk.
                            audio_response = requests.get(audio_url)
                            if audio_response.status_code == 200:
                                temp_audio_path = f"temp_audio_{os.path.basename(audio_url)}"
                                with open(temp_audio_path, "wb") as f:
                                    f.write(audio_response.content)

                                st.markdown("<div class='audio-container'>", unsafe_allow_html=True)
                                st.audio(temp_audio_path, format="audio/mp3")

                                # Download link for the generated speech.
                                st.markdown(f"<a href='{audio_url}' download='hindi_summary.mp3'>Download Hindi Audio</a>", unsafe_allow_html=True)

                                # Clean up temp file (optional)
                                # os.remove(temp_audio_path)  # Uncomment to delete after use
                            else:
                                st.warning(f"Unable to load audio file (HTTP {audio_response.status_code}). You can still read the Hindi text below.")
                        else:
                            st.info("Hindi audio summary would be available here.")
                    except Exception as e:
                        st.warning(f"Error playing audio: {str(e)}. You can still read the Hindi text below.")

                # Hindi transcript, split at the danda for readability.
                with st.expander("Show Hindi Text"):
                    hindi_text = response.get("Hindi Summary", "Hindi text not available.")
                    for paragraph in hindi_text.split("। "):
                        if paragraph.strip():
                            # Restore the sentence terminator lost by split().
                            if not paragraph.strip().endswith("।"):
                                paragraph += "।"
                            st.markdown(f"<p style='font-size: 16px; margin-bottom: 10px;'>{paragraph}</p>", unsafe_allow_html=True)

                st.markdown("</div>", unsafe_allow_html=True)

            st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

            # Article cards.
            st.markdown("<h3 class='sub-header'>News Articles</h3>", unsafe_allow_html=True)
            articles = response.get("Articles", [])

            if not articles:
                st.info("No articles found for this company.")
            else:
                for i, article in enumerate(articles):
                    display_article(article, i)

            st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

            # Comparative analysis.
            st.markdown("<h3 class='sub-header'>Comparative Analysis</h3>", unsafe_allow_html=True)

            topic_data = response["Comparative Sentiment Score"]["Topic Overlap"]

            col1, col2 = st.columns(2)

            with col1:
                st.markdown("<div class='card'>", unsafe_allow_html=True)
                st.markdown("<h4>Common Topics</h4>", unsafe_allow_html=True)

                common_topics = topic_data.get("Common Topics Across All", [])
                if common_topics:
                    for topic in common_topics:
                        st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
                else:
                    st.info("No common topics found across articles.")

                st.markdown("</div>", unsafe_allow_html=True)

            with col2:
                st.markdown("<div class='card'>", unsafe_allow_html=True)
                st.markdown("<h4>Coverage Comparison</h4>", unsafe_allow_html=True)

                comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
                if comparisons:
                    for i, comparison in enumerate(comparisons[:3]):  # Show only top 3 comparisons
                        st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
                        st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
                else:
                    st.info("No comparative insights available.")

                st.markdown("</div>", unsafe_allow_html=True)

            # Full comparison list.
            with st.expander("View All Comparisons"):
                comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
                for i, comparison in enumerate(comparisons):
                    st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
                    st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
                    st.markdown("<hr>", unsafe_allow_html=True)

            # Optional raw JSON view in the example format.
            if show_json:
                st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
                st.markdown("<h3 class='sub-header'>Example JSON Format</h3>", unsafe_allow_html=True)
                json_output = generate_example_output(company_name)
                st.code(json_output, language="json")
else:
    # Placeholder shown before the first analysis is requested.
    st.markdown("<div class='card'>", unsafe_allow_html=True)
    st.markdown("<h3 class='sub-header'>Enter a Company Name to Begin Analysis</h3>", unsafe_allow_html=True)
    st.markdown("""
    <p class='info-text'>
    This application will:
    </p>
    <ul class='info-text'>
    <li>Extract news articles from multiple sources</li>
    <li>Analyze sentiment (positive, negative, neutral)</li>
    <li>Identify key topics in each article</li>
    <li>Perform comparative analysis across articles</li>
    <li>Generate Hindi speech output summarizing the findings</li>
    </ul>
    """, unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)

    # Sample output image.
    st.image("https://miro.medium.com/max/1400/1*Ger-949PgQnaje2oa9XMdw.png", caption="Sample sentiment analysis visualization")

# Footer.
st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
st.markdown("<p class='info-text' style='text-align: center;'>News Summarization & Text-to-Speech Application | Developed with Streamlit and FastAPI</p>", unsafe_allow_html=True)
|
generate_json_output.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
def generate_json_output(company_name, api_url="http://localhost:8000"):
    """
    Generate output in the example format for the given company.

    Args:
        company_name (str): Name of the company to analyze
        api_url (str): Base URL of the API

    Returns:
        str: Formatted JSON string, or an error payload ("error"/"message"
        keys) when the request or reshaping fails.
    """
    try:
        # Fetch the complete analysis from the backend.
        resp = requests.post(
            f"{api_url}/api/complete_analysis",
            json={"company_name": company_name},
        )
        resp.raise_for_status()
        data = resp.json()

        comparative = data["Comparative Sentiment Score"]

        # Reshape into the exact example schema.
        shaped = {
            "Company": data["Company"],
            "Articles": data["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": comparative["Sentiment Distribution"],
                "Coverage Differences": comparative["Coverage Differences"],
                "Topic Overlap": comparative["Topic Overlap"],
            },
            "Final Sentiment Analysis": data["Final Sentiment Analysis"],
            "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available",
        }

        return json.dumps(shaped, indent=2)

    except Exception as e:
        return json.dumps(
            {"error": str(e), "message": "Failed to generate example output"},
            indent=2,
        )
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
    # Take the company from argv when given, else prompt interactively.
    company_name = sys.argv[1] if len(sys.argv) > 1 else input("Enter company name: ")

    print(f"Input:\nCompany Name: {company_name}")
    print("Output:", generate_json_output(company_name))
|
healthcheck.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Healthcheck script to verify the functionality of all components of the application.
|
| 3 |
+
Run this script to check if all dependencies are correctly installed and working.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
import traceback
|
| 10 |
+
|
| 11 |
+
def run_checks():
|
| 12 |
+
print("Starting health check for News Summarization and TTS Application...")
|
| 13 |
+
checks_passed = 0
|
| 14 |
+
checks_failed = 0
|
| 15 |
+
|
| 16 |
+
# Check 1: Verify imports
|
| 17 |
+
print("\n1. Checking imports...")
|
| 18 |
+
try:
|
| 19 |
+
# Standard libraries
|
| 20 |
+
import json
|
| 21 |
+
import re
|
| 22 |
+
|
| 23 |
+
# Web and API dependencies
|
| 24 |
+
import requests
|
| 25 |
+
import fastapi
|
| 26 |
+
import uvicorn
|
| 27 |
+
import streamlit
|
| 28 |
+
|
| 29 |
+
# Data processing
|
| 30 |
+
import pandas
|
| 31 |
+
import numpy
|
| 32 |
+
import bs4
|
| 33 |
+
|
| 34 |
+
# NLP
|
| 35 |
+
import nltk
|
| 36 |
+
import networkx
|
| 37 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 38 |
+
|
| 39 |
+
# ML and Transformers
|
| 40 |
+
import torch
|
| 41 |
+
import transformers
|
| 42 |
+
from transformers import pipeline
|
| 43 |
+
|
| 44 |
+
# TTS and Translation
|
| 45 |
+
import deep_translator
|
| 46 |
+
from deep_translator import GoogleTranslator
|
| 47 |
+
import gtts
|
| 48 |
+
import pyttsx3
|
| 49 |
+
|
| 50 |
+
print("✅ All imports successful.")
|
| 51 |
+
checks_passed += 1
|
| 52 |
+
except ImportError as e:
|
| 53 |
+
print(f"❌ Import error: {str(e)}")
|
| 54 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 55 |
+
checks_failed += 1
|
| 56 |
+
|
| 57 |
+
# Check 2: Verify NLTK data
|
| 58 |
+
print("\n2. Checking NLTK data...")
|
| 59 |
+
try:
|
| 60 |
+
import nltk
|
| 61 |
+
nltk.data.find('tokenizers/punkt')
|
| 62 |
+
nltk.data.find('corpora/stopwords')
|
| 63 |
+
print("✅ NLTK data verified.")
|
| 64 |
+
checks_passed += 1
|
| 65 |
+
except LookupError as e:
|
| 66 |
+
print(f"❌ NLTK data error: {str(e)}")
|
| 67 |
+
print("Trying to download necessary NLTK data...")
|
| 68 |
+
try:
|
| 69 |
+
nltk.download('punkt')
|
| 70 |
+
nltk.download('stopwords')
|
| 71 |
+
print("✅ NLTK data downloaded successfully.")
|
| 72 |
+
checks_passed += 1
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f"❌ Failed to download NLTK data: {str(e)}")
|
| 75 |
+
checks_failed += 1
|
| 76 |
+
|
| 77 |
+
# Check 3: Test translation
|
| 78 |
+
print("\n3. Testing translation service...")
|
| 79 |
+
try:
|
| 80 |
+
from deep_translator import GoogleTranslator
|
| 81 |
+
translator = GoogleTranslator(source='en', target='hi')
|
| 82 |
+
text = "Hello, this is a test."
|
| 83 |
+
translated = translator.translate(text)
|
| 84 |
+
print(f"Original text: {text}")
|
| 85 |
+
print(f"Translated text: {translated}")
|
| 86 |
+
if translated and len(translated) > 0:
|
| 87 |
+
print("✅ Translation service working.")
|
| 88 |
+
checks_passed += 1
|
| 89 |
+
else:
|
| 90 |
+
print("❌ Translation returned empty result.")
|
| 91 |
+
checks_failed += 1
|
| 92 |
+
except Exception as e:
|
| 93 |
+
print(f"❌ Translation error: {str(e)}")
|
| 94 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 95 |
+
checks_failed += 1
|
| 96 |
+
|
| 97 |
+
# Check 4: Test TTS
|
| 98 |
+
print("\n4. Testing Text-to-Speech service...")
|
| 99 |
+
try:
|
| 100 |
+
from gtts import gTTS
|
| 101 |
+
test_text = "परीक्षण पाठ" # "Test text" in Hindi
|
| 102 |
+
test_file = 'test_audio.mp3'
|
| 103 |
+
|
| 104 |
+
# Try gTTS
|
| 105 |
+
tts = gTTS(text=test_text, lang='hi', slow=False)
|
| 106 |
+
tts.save(test_file)
|
| 107 |
+
|
| 108 |
+
if os.path.exists(test_file) and os.path.getsize(test_file) > 0:
|
| 109 |
+
print("✅ gTTS service working.")
|
| 110 |
+
# Clean up test file
|
| 111 |
+
try:
|
| 112 |
+
os.remove(test_file)
|
| 113 |
+
except:
|
| 114 |
+
pass
|
| 115 |
+
checks_passed += 1
|
| 116 |
+
else:
|
| 117 |
+
print("❌ gTTS failed to generate a valid audio file.")
|
| 118 |
+
checks_failed += 1
|
| 119 |
+
except Exception as e:
|
| 120 |
+
print(f"❌ Text-to-Speech error: {str(e)}")
|
| 121 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 122 |
+
checks_failed += 1
|
| 123 |
+
|
| 124 |
+
# Check 5: Test sentiment analysis
|
| 125 |
+
print("\n5. Testing sentiment analysis...")
|
| 126 |
+
try:
|
| 127 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 128 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 129 |
+
test_text = "This product is excellent and I love it!"
|
| 130 |
+
scores = analyzer.polarity_scores(test_text)
|
| 131 |
+
print(f"Sentiment scores for '{test_text}': {scores}")
|
| 132 |
+
if 'compound' in scores:
|
| 133 |
+
print("✅ Sentiment analysis working.")
|
| 134 |
+
checks_passed += 1
|
| 135 |
+
else:
|
| 136 |
+
print("❌ Sentiment analysis returned unexpected result.")
|
| 137 |
+
checks_failed += 1
|
| 138 |
+
except Exception as e:
|
| 139 |
+
print(f"❌ Sentiment analysis error: {str(e)}")
|
| 140 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 141 |
+
checks_failed += 1
|
| 142 |
+
|
| 143 |
+
# Check 6: Test Transformers
|
| 144 |
+
print("\n6. Testing Transformer models...")
|
| 145 |
+
try:
|
| 146 |
+
from transformers import pipeline
|
| 147 |
+
sentiment_task = pipeline("sentiment-analysis", return_all_scores=False)
|
| 148 |
+
result = sentiment_task("I love using this application!")
|
| 149 |
+
print(f"Transformer sentiment analysis result: {result}")
|
| 150 |
+
print("✅ Transformer models working.")
|
| 151 |
+
checks_passed += 1
|
| 152 |
+
except Exception as e:
|
| 153 |
+
print(f"❌ Transformer models error: {str(e)}")
|
| 154 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 155 |
+
checks_failed += 1
|
| 156 |
+
|
| 157 |
+
# Summary
|
| 158 |
+
print("\n" + "="*50)
|
| 159 |
+
print(f"Health Check Summary: {checks_passed} checks passed, {checks_failed} checks failed")
|
| 160 |
+
|
| 161 |
+
if checks_failed == 0:
|
| 162 |
+
print("\n✅ All systems operational! The application should run correctly.")
|
| 163 |
+
return True
|
| 164 |
+
else:
|
| 165 |
+
print("\n❌ Some checks failed. Please review the errors above.")
|
| 166 |
+
return False
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
    # Run the health checks; exit non-zero so CI / container
    # orchestrators can detect a failed environment.
    if not run_checks():
        sys.exit(1)
|
requirements.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
streamlit==1.27.0
|
| 3 |
+
fastapi==0.103.1
|
| 4 |
+
uvicorn==0.23.2
|
| 5 |
+
requests==2.31.0
|
| 6 |
+
beautifulsoup4==4.12.2
|
| 7 |
+
pandas==2.1.0
|
| 8 |
+
numpy==1.25.2
|
| 9 |
+
scipy==1.10.1
|
| 10 |
+
|
| 11 |
+
# NLP and Sentiment Analysis
|
| 12 |
+
transformers==4.33.1
|
| 13 |
+
torch==2.0.1
|
| 14 |
+
nltk==3.8.1
|
| 15 |
+
vaderSentiment==3.3.2
|
| 16 |
+
|
| 17 |
+
# Text-to-Speech
|
| 18 |
+
gTTS==2.3.2
|
| 19 |
+
pyttsx3==2.90
|
| 20 |
+
deep-translator==1.11.4
|
| 21 |
+
|
| 22 |
+
# Data Processing and Visualization
|
| 23 |
+
matplotlib==3.7.3
|
| 24 |
+
seaborn==0.12.2
|
| 25 |
+
scikit-learn==1.3.0
|
| 26 |
+
networkx==3.1
|
| 27 |
+
|
| 28 |
+
# API and Web
|
| 29 |
+
aiohttp==3.8.5
|
| 30 |
+
httpx==0.24.1
|
| 31 |
+
pydantic==2.3.0
|
| 32 |
+
python-dotenv==1.0.0
|
| 33 |
+
python-multipart==0.0.6
|
| 34 |
+
|
| 35 |
+
# HuggingFace Spaces
|
| 36 |
+
huggingface-hub==0.16.4
|
| 37 |
+
|
| 38 |
+
# Audio processing
|
| 39 |
+
pydub==0.25.1
|
temp_audio_0120e65155dd4f5c8d53a8b7c49336f4.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2ea50f7581396194b20f1482e8844ee6d62e8c9e192ee2faf7542a8931d13bb
|
| 3 |
+
size 180096
|
temp_audio_0a8a80f832c0405797b7d6475f9e6046.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0491c3041a595550580136a005ee3db4c03fedcbfa79e9321584d8ae398f93c9
|
| 3 |
+
size 183744
|
temp_audio_0c68dc54f23f4498937d6696f9245651.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1946d2795deec708374fc1da8b974a5f9d50df9d4f250c2b7f4a308e18f276b
|
| 3 |
+
size 153216
|
temp_audio_2888808c976c4d658e5e0dfa52370f0c.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50f05ef65081b51c35d5a4f723f3c675ac46ed4246acc47f7c8140551c73c9f3
|
| 3 |
+
size 209856
|
temp_audio_2ac3161769b44d67883f8c68bd68828a.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:adf0ccc8d77ff073e0345838e03e80741b51ae824cd4ed59fe9792f722f48e88
|
| 3 |
+
size 185088
|
temp_audio_2d1949225eb4436e963e7e46db963bf9.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c5eafa4a2a4c29852b879896b1b9f88224468b5d278f338279d43abbd4aa318
|
| 3 |
+
size 156672
|
temp_audio_477f1b5f9c5a49bf956a0c1771ec41d8.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60519a4cb183ae053ea97f60f81c62d46830b01a36c3805ac5d6cfd01a19d951
|
| 3 |
+
size 240384
|
temp_audio_531e8b49f3fc4d53b3c99c4cbfff71ae.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d78e22cb94e27f7f054a77698dfcebe42dd9ca0a18467862e8ab09a851dab79
|
| 3 |
+
size 187968
|
temp_audio_55fa301d954b442e9ce58d21be70ed2d.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f94bf442c40592244c8071c82ad3cfc4128ee5d0551e145578dcbd5e418600a2
|
| 3 |
+
size 255552
|
temp_audio_59225e2a079d4cfaa0c112206f4e14bb.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9facfb6215d210bf456e48ba64979e1b99cbffa007e71d686e792dd3d1def645
|
| 3 |
+
size 168384
|
temp_audio_629b84e1f9f44706a7e4082b2b309b07.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fbf5c9e0e6d758be5f9ea772e509fb8a1eb048b326492b3add37db1b0b0976cc
|
| 3 |
+
size 214656
|
temp_audio_84748717c8134d868dcc7633de098f15.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:599086aa66fbcd66da5c08f3d1f8fdcd73f6a8f4edae3184ee76222b1b029714
|
| 3 |
+
size 180096
|
temp_audio_8e113f3e7933446291ab390f892b2345.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:320eaddf4e497e1b4d8fc3305fd597dbf9793890f81f4d6ee52e565e058624d1
|
| 3 |
+
size 133632
|
temp_audio_aaecc4127abd497d988be047fba22731.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39c4858ae106fe39ac572cd4f815e2f0f917e0dd1dfc7e3b01995629f5f5f3f9
|
| 3 |
+
size 1246848
|
temp_audio_acd4f5f81a244c0ba4db27bddd0801d5.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:334f7c7ed2a55e24a84d56bdb33944a3c7df2ef98c3a617d7c4029ee82dc79e6
|
| 3 |
+
size 249216
|
temp_audio_aec8c57313224da590f3447c79a77b89.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d6246d99ae6ab85dc901b1e749b9c7c9304ff887c61251f1201b90b8e916146
|
| 3 |
+
size 156864
|
temp_audio_c8c07565663d4c69bd740dc7954a921a.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36d8ccd2ec4e6bf63d85b1f91e5b12921d26cd27b69c2e253565e277143ddaf4
|
| 3 |
+
size 153216
|
temp_audio_d322c8c906234dc98675709f214ed49c.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a833175fb3df2f9fb43514a3bc481b362868383c30791d183a094d2cb67da1f4
|
| 3 |
+
size 209856
|
temp_audio_f0761eacc6b34a14910b39c7ee84b19d.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2d177a857b1b569693aaa7e24a849efe89be48073d8fd3e57447bd56c3b0176
|
| 3 |
+
size 1247040
|
temp_audio_f63cf8928c004b7d8a0cfaffebf44614.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1625880ee058e781624428db996f22f359bba1e85b5430ea75bc13abdae93a56
|
| 3 |
+
size 218304
|
utils.py
ADDED
|
@@ -0,0 +1,1132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import re
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import time
|
| 6 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
from nltk.corpus import stopwords
|
| 11 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 12 |
+
from nltk.cluster.util import cosine_distance
|
| 13 |
+
import networkx as nx
|
| 14 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 15 |
+
from collections import Counter
|
| 16 |
+
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
|
| 17 |
+
from deep_translator import GoogleTranslator
|
| 18 |
+
from gtts import gTTS
|
| 19 |
+
import pyttsx3
|
| 20 |
+
|
| 21 |
+
# Ensure the required NLTK corpora are available, downloading them on
# first run if either lookup fails.
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    for _resource in ('punkt', 'stopwords'):
        nltk.download(_resource)
|
| 29 |
+
|
| 30 |
+
# Initialize sentiment analyzer
# VADER is lexicon/rule-based: fast and requires no model download.
vader_analyzer = SentimentIntensityAnalyzer()

# Initialize advanced sentiment model
# nlptown's multilingual BERT classifies text into 1-5 "star" labels.
# NOTE(review): from_pretrained fetches weights from the HuggingFace hub on
# first use, so importing this module is slow and requires network access
# the first time — confirm this is acceptable for the deployment target.
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
advanced_sentiment = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

# Initialize translator
# Hard-coded English -> Hindi; presumably feeds the Hindi TTS flow used
# elsewhere in the project — confirm before changing.
translator = GoogleTranslator(source='en', target='hi')
|
| 41 |
+
|
| 42 |
+
class NewsArticle:
    """Container for one scraped news article.

    Any of ``summary``, ``sentiment``, or ``topics`` left empty at
    construction time is derived automatically from the article text via
    the static helpers below.
    """

    def __init__(self, title: str, url: str, content: str, summary: str = "", source: str = "",
                 date: str = "", sentiment: str = "", topics: List[str] = None):
        self.title = title
        self.url = url
        self.content = content
        # Fall back to extractive summarization when no summary is supplied.
        self.summary = summary if summary else self.generate_summary(content)
        self.source = source
        self.date = date
        # Derive sentiment from content (weighted with title) when not supplied.
        self.sentiment = sentiment if sentiment else self.analyze_sentiment(content, title)
        # NOTE(review): truthiness test means an explicitly passed empty list
        # also triggers topic extraction — confirm that is intended.
        self.topics = topics if topics else self.extract_topics(content)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all article fields."""
        return {
            "title": self.title,
            "url": self.url,
            "content": self.content,
            "summary": self.summary,
            "source": self.source,
            "date": self.date,
            "sentiment": self.sentiment,
            "topics": self.topics
        }

    @staticmethod
    def analyze_sentiment(text: str, title: str = "") -> str:
        """
        Analyze sentiment using a combination of methods for more accurate results.
        We give more weight to the title sentiment and use advanced model when possible.

        Blends VADER compound scores (title weighted 0.6 when present) with
        the module-level nlptown 1-5-star transformer model, then maps the
        blended score onto five labels: "Positive", "Slightly Positive",
        "Neutral", "Slightly Negative", "Negative". Falls back to "Neutral"
        on any unexpected failure.
        """
        # Set thresholds for VADER sentiment
        threshold_positive = 0.05  # Default 0.05
        threshold_negative = -0.05  # Default -0.05

        # Use VADER for basic sentiment analysis on both title and content
        try:
            title_scores = vader_analyzer.polarity_scores(title) if title else {'compound': 0}
            content_scores = vader_analyzer.polarity_scores(text)

            # Weight the title more heavily (title sentiment is often more reliable)
            title_weight = 0.6 if title else 0
            content_weight = 1.0 - title_weight

            compound_score = (title_weight * title_scores['compound']) + (content_weight * content_scores['compound'])

            # Try to use the advanced model for additional insight (for short texts)
            advanced_result = None
            advanced_score = 0

            try:
                # Use title + first part of content for advanced model
                # (300 chars keeps the input within typical BERT limits).
                sample_text = title + ". " + text[:300] if title else text[:300]
                advanced_result = advanced_sentiment(sample_text)[0]

                # Map advanced model results to a -1 to 1 scale similar to VADER
                label = advanced_result['label']
                confidence = advanced_result['score']

                # Map the 1-5 star rating to a -1 to 1 scale
                if label == '1 star' or label == '2 stars':
                    advanced_score = -confidence
                elif label == '4 stars' or label == '5 stars':
                    advanced_score = confidence
                else:  # 3 stars is neutral
                    advanced_score = 0

                # Combine VADER and advanced model scores
                # Give more weight to advanced model when confidence is high
                if confidence > 0.8:
                    compound_score = (0.4 * compound_score) + (0.6 * advanced_score)
                else:
                    compound_score = (0.7 * compound_score) + (0.3 * advanced_score)

            except Exception as e:
                print(f"Advanced sentiment analysis failed: {str(e)}")
                # Continue with just VADER if advanced model fails
                pass

            # Fine-grained sentiment mapping
            if compound_score >= 0.3:
                return "Positive"
            elif compound_score >= threshold_positive:
                return "Slightly Positive"
            elif compound_score <= -0.3:
                return "Negative"
            elif compound_score <= threshold_negative:
                return "Slightly Negative"
            else:
                return "Neutral"

        except Exception as e:
            print(f"Sentiment analysis error: {str(e)}")
            return "Neutral"  # Default fallback

    @staticmethod
    def generate_summary(text: str, num_sentences: int = 5) -> str:
        """TextRank-style extractive summary of *text*.

        Ranks sentences by PageRank over a cosine-similarity graph (built
        by the module-level ``build_similarity_matrix``) and returns the top
        ``num_sentences`` in their original order. Short texts (< 100 chars
        or fewer sentences than requested) are returned unchanged.
        NOTE(review): ``nx.pagerank`` can raise on non-convergence; that
        would propagate out of ``__init__`` — confirm callers tolerate it.
        """
        # Generate summary using extractive summarization
        if not text or len(text) < 100:
            return text

        # Tokenize sentences
        sentences = sent_tokenize(text)
        if len(sentences) <= num_sentences:
            return text

        # Calculate sentence similarity and rank them
        similarity_matrix = build_similarity_matrix(sentences)
        scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))

        # Select top sentences
        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        summary_sentences = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]

        # Maintain original order
        original_order = []
        for sentence in sentences:
            if sentence in summary_sentences and sentence not in original_order:
                original_order.append(sentence)
            if len(original_order) >= num_sentences:
                break

        return " ".join(original_order)

    @staticmethod
    def extract_topics(text: str, num_topics: int = 5) -> List[str]:
        """Return the *num_topics* most frequent substantive words in *text*.

        "Substantive" means alphabetic, longer than 3 characters, and not an
        English stopword; ranking is by raw term frequency.
        """
        # Extract key topics from text based on term frequency
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text.lower())

        # Filter out stopwords and short words
        filtered_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 3]

        # Count word frequencies
        word_counts = Counter(filtered_words)

        # Return most common words as topics
        topics = [word for word, _ in word_counts.most_common(num_topics)]
        return topics
|
| 180 |
+
|
| 181 |
+
def build_similarity_matrix(sentences: List[str]) -> np.ndarray:
    """Build a pairwise cosine-similarity matrix for the given sentences.

    Diagonal entries (a sentence compared with itself) are left at 0 so
    that self-similarity does not dominate the downstream PageRank.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                continue
            matrix[row, col] = sentence_similarity(left, right)

    return matrix
|
| 196 |
+
|
| 197 |
+
def sentence_similarity(sent1: str, sent2: str) -> float:
    """Calculate similarity between two sentences using cosine similarity.

    Tokenizes both sentences, keeps alphabetic tokens lower-cased, builds
    binary (presence/absence) vectors over the combined vocabulary, and
    returns 1 - cosine_distance. Returns 0.0 when either sentence yields
    no alphabetic tokens (avoids division by a zero-norm vector).
    """
    # Tokenize into lower-cased alphabetic tokens. Sets are used because the
    # vectors below are binary anyway (duplicates carry no weight), and set
    # membership makes vector construction O(V) instead of O(V*N).
    words1 = {word.lower() for word in word_tokenize(sent1) if word.isalpha()}
    words2 = {word.lower() for word in word_tokenize(sent2) if word.isalpha()}

    # Combined vocabulary of both sentences.
    all_words = list(words1 | words2)

    # Binary occurrence vectors over the shared vocabulary.
    vector1 = [1 if word in words1 else 0 for word in all_words]
    vector2 = [1 if word in words2 else 0 for word in all_words]

    # Guard against zero vectors (cosine distance undefined).
    if not any(vector1) or not any(vector2):
        return 0.0

    # nltk's cosine_distance is 1 - cos(theta), so invert to get similarity.
    return 1 - cosine_distance(vector1, vector2)
|
| 215 |
+
|
| 216 |
+
def search_news(company_name: str, num_articles: int = 10) -> List[NewsArticle]:
    """Scrape news articles about *company_name* from several finance sites.

    Runs a fixed set of search queries against a fixed list of news sources,
    fetches candidate article pages, extracts title/content/source/date with
    BeautifulSoup, de-duplicates by title similarity, and stops as soon as
    ``num_articles`` articles have been collected.

    Args:
        company_name: Company to search for. For Yahoo Finance it is also
            tried directly as a ticker symbol.
        num_articles: Maximum number of articles to return (default 10).

    Returns:
        Up to ``num_articles`` NewsArticle objects. If nothing could be
        scraped and ``num_articles > 0``, a single placeholder article is
        returned so downstream code never sees an empty list.

    NOTE(review): this depends on the current HTML structure of third-party
    sites — selectors may silently stop matching if those sites change.
    """
    # List to store articles
    articles = []

    # Define search queries and news sources
    search_queries = [
        f"{company_name} news",
        f"{company_name} financial news",
        f"{company_name} business news",
        f"{company_name} recent news",
        f"{company_name} company news",
        f"{company_name} stock",
        f"{company_name} market"
    ]

    # Updated news sources with more reliable sources.
    # "direct_access" means the company name/ticker is appended to the URL
    # path instead of being sent as a search query string.
    news_sources = [
        {
            "base_url": "https://finance.yahoo.com/quote/",
            "article_patterns": ["news", "finance", "articles"],
            "direct_access": True
        },
        {
            "base_url": "https://www.reuters.com/search/news?blob=",
            "article_patterns": ["article", "business", "companies", "markets"],
            "direct_access": False
        },
        {
            "base_url": "https://www.marketwatch.com/search?q=",
            "article_patterns": ["story", "articles", "news"],
            "direct_access": False
        },
        {
            "base_url": "https://www.fool.com/search?q=",
            "article_patterns": ["article", "investing", "stock"],
            "direct_access": False
        },
        {
            "base_url": "https://seekingalpha.com/search?q=",
            "article_patterns": ["article", "news", "stock", "analysis"],
            "direct_access": False
        },
        {
            "base_url": "https://www.zacks.com/search.php?q=",
            "article_patterns": ["stock", "research", "analyst"],
            "direct_access": False
        },
        {
            "base_url": "https://economictimes.indiatimes.com/search?q=",
            "article_patterns": ["articleshow", "news", "industry"],
            "direct_access": False
        },
        {
            "base_url": "https://www.bloomberg.com/search?query=",
            "article_patterns": ["news", "articles"],
            "direct_access": False
        }
    ]

    print(f"Starting search for news about {company_name}...")

    # Search each source with each query until we have enough articles
    for query in search_queries:
        if len(articles) >= num_articles:
            break

        for source in news_sources:
            if len(articles) >= num_articles:
                break

            try:
                source_base = source["base_url"]
                article_patterns = source["article_patterns"]
                direct_access = source["direct_access"]

                # Construct search URL
                if direct_access:
                    # Try to fetch the stock symbol for Yahoo Finance
                    if "yahoo" in source_base:
                        try:
                            # First try the company name directly (for known tickers)
                            search_url = f"{source_base}{company_name}/news"
                            print(f"Trying direct ticker access: {search_url}")

                            # Fetch to check if valid
                            headers = {
                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
                            }
                            test_response = requests.get(search_url, headers=headers, timeout=10)

                            # If we got a 404, try searching for the symbol first
                            if test_response.status_code == 404:
                                print("Company name not a valid ticker, searching for symbol...")
                                symbol_url = f"https://finance.yahoo.com/lookup?s={company_name}"
                                symbol_response = requests.get(symbol_url, headers=headers, timeout=10)

                                if symbol_response.status_code == 200:
                                    symbol_soup = BeautifulSoup(symbol_response.text, 'html.parser')
                                    # Try to find the first stock symbol result
                                    symbol_row = symbol_soup.select_one("tr.data-row0")
                                    if symbol_row:
                                        symbol_cell = symbol_row.select_one("td:first-child a")
                                        if symbol_cell:
                                            symbol = symbol_cell.text.strip()
                                            search_url = f"{source_base}{symbol}/news"
                                            print(f"Found symbol {symbol}, using URL: {search_url}")
                        except Exception as e:
                            print(f"Error getting stock symbol: {str(e)}")
                            search_url = f"{source_base}{company_name}/news"
                    else:
                        search_url = f"{source_base}{company_name}/news"
                else:
                    search_url = f"{source_base}{query.replace(' ', '+')}"

                print(f"Searching {search_url}")

                # Fetch search results with retry mechanism
                max_retries = 3
                retry_count = 0
                response = None

                while retry_count < max_retries:
                    try:
                        # Browser-like headers to reduce the chance of being blocked.
                        headers = {
                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                            "Accept": "text/html,application/xhtml+xml,application/xml",
                            "Accept-Language": "en-US,en;q=0.9",
                            "Referer": "https://www.google.com/"
                        }
                        response = requests.get(search_url, headers=headers, timeout=15)
                        if response.status_code == 200:
                            break
                        retry_count += 1
                        print(f"Retry {retry_count}/{max_retries} for {search_url} (status: {response.status_code})")
                        time.sleep(1)  # Short delay before retry
                    except Exception as e:
                        retry_count += 1
                        print(f"Request error (attempt {retry_count}/{max_retries}): {str(e)}")
                        time.sleep(1)

                if not response or response.status_code != 200:
                    print(f"Failed to fetch results from {search_url} after {max_retries} attempts")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract article links - using more flexible patterns
                links = soup.find_all('a', href=True)
                article_links = []

                # Domain for resolving relative URLs (scheme + host of the final response URL)
                domain = response.url.split('/')[0] + '//' + response.url.split('/')[2]
                print(f"Domain for resolving URLs: {domain}")

                for link in links:
                    href = link['href']
                    link_text = link.text.strip()

                    # Skip empty links or navigation elements
                    if not link_text or len(link_text) < 10 or href.startswith('#'):
                        continue

                    # Check if the link matches any of our article patterns
                    is_article_link = False
                    for pattern in article_patterns:
                        if pattern in href.lower():
                            is_article_link = True
                            break

                    # Check for the company name in link text or URL (less restrictive now)
                    contains_company = (
                        company_name.lower() in link_text.lower() or
                        company_name.lower() in href.lower()
                    )

                    if is_article_link or contains_company:
                        # Convert relative URLs to absolute
                        if href.startswith('/'):
                            href = f"{domain}{href}"
                        elif not href.startswith(('http://', 'https://')):
                            href = f"{domain}/{href}"

                        # Avoid duplicates
                        if href not in article_links:
                            article_links.append(href)
                            print(f"Found potential article: {link_text[:50]}... at {href}")

                print(f"Found {len(article_links)} potential article links from {search_url}")

                # Process each article link (only the first few per search page)
                for link in article_links[:5]:  # Increased from 3 to 5
                    if len(articles) >= num_articles:
                        break

                    try:
                        print(f"Fetching article: {link}")
                        article_response = requests.get(link, headers=headers, timeout=15)

                        if article_response.status_code != 200:
                            print(f"Failed to fetch article: {article_response.status_code}")
                            continue

                        article_soup = BeautifulSoup(article_response.text, 'html.parser')

                        # Extract article title - more robust method
                        title = None

                        # Try different elements that could contain the title
                        # (leading '.' means a CSS class selector, otherwise a tag name)
                        for title_tag in ['h1', 'h2', '.headline', '.title', 'title']:
                            if title:
                                break

                            if title_tag.startswith('.'):
                                elements = article_soup.select(title_tag)
                            else:
                                elements = article_soup.find_all(title_tag)

                            for element in elements:
                                candidate = element.text.strip()
                                if len(candidate) > 5 and len(candidate) < 200:  # Reasonable title length
                                    title = candidate
                                    break

                        if not title:
                            print("Could not find a suitable title")
                            continue

                        # Check if title contains company name (case insensitive)
                        if company_name.lower() not in title.lower():
                            # Try alternative check - sometimes the title doesn't explicitly mention the company
                            meta_description = article_soup.find('meta', attrs={'name': 'description'}) or \
                                              article_soup.find('meta', attrs={'property': 'og:description'})

                            if meta_description and 'content' in meta_description.attrs:
                                meta_text = meta_description['content']
                                if company_name.lower() not in meta_text.lower():
                                    # One more check in the page content
                                    page_text = article_soup.get_text().lower()
                                    company_mentions = page_text.count(company_name.lower())
                                    if company_mentions < 2:  # Require at least 2 mentions
                                        print(f"Article doesn't seem to be about {company_name}: {title}")
                                        continue

                        # Extract article content - improved method
                        content = ""

                        # Try multiple content extraction strategies
                        content_containers = []

                        # 1. Look for article/main content containers
                        for container in ['article', 'main', '.article-body', '.story-body', '.story-content',
                                         '.article-content', '.content-body', '.entry-content']:
                            if container.startswith('.'):
                                elements = article_soup.select(container)
                            else:
                                elements = article_soup.find_all(container)

                            content_containers.extend(elements)

                        # 2. If no specific containers, fallback to div with article-like classes
                        if not content_containers:
                            for div in article_soup.find_all('div', class_=True):
                                classes = div.get('class', [])
                                for cls in classes:
                                    if any(term in cls.lower() for term in ['article', 'story', 'content', 'body', 'text']):
                                        content_containers.append(div)
                                        break

                        # 3. Extract paragraphs from containers
                        processed_paragraphs = set()  # To avoid duplicates

                        for container in content_containers:
                            for p in container.find_all('p'):
                                p_text = p.text.strip()
                                # Avoid very short or duplicate paragraphs
                                if len(p_text) > 30 and p_text not in processed_paragraphs:
                                    content += p_text + " "
                                    processed_paragraphs.add(p_text)

                        # 4. If still no content, try all paragraphs
                        if not content:
                            for p in article_soup.find_all('p'):
                                p_text = p.text.strip()
                                if len(p_text) > 30 and p_text not in processed_paragraphs:
                                    content += p_text + " "
                                    processed_paragraphs.add(p_text)

                        content = content.strip()

                        # Skip if content is too short
                        if len(content) < 300:  # Reduced from 500 to be less restrictive
                            print(f"Article content too short: {len(content)} characters")
                            continue

                        # Extract source name - more robust method
                        source = None

                        # Try to get from meta tags
                        meta_site_name = article_soup.find('meta', attrs={'property': 'og:site_name'})
                        if meta_site_name and 'content' in meta_site_name.attrs:
                            source = meta_site_name['content']
                        else:
                            # Extract from URL
                            try:
                                from urllib.parse import urlparse
                                parsed_url = urlparse(link)
                                source = parsed_url.netloc
                            except:
                                source = response.url.split('/')[2]

                        # Extract date - improved method
                        date = ""

                        # Try multiple date extraction strategies
                        # 1. Look for time element
                        date_tag = article_soup.find('time')

                        # 2. Look for meta tags with date
                        if not date and (not date_tag or not date_tag.get('datetime')):
                            for meta_name in ['article:published_time', 'date', 'publish-date', 'article:modified_time']:
                                meta_date = article_soup.find('meta', attrs={'property': meta_name}) or \
                                           article_soup.find('meta', attrs={'name': meta_name})

                                if meta_date and 'content' in meta_date.attrs:
                                    date = meta_date['content']
                                    break

                        # 3. Look for spans/divs with date-related classes
                        if not date:
                            date_classes = ['date', 'time', 'published', 'posted', 'datetime']
                            for cls in date_classes:
                                elements = article_soup.find_all(['span', 'div', 'p'], class_=lambda x: x and cls.lower() in x.lower())
                                if elements:
                                    date = elements[0].text.strip()
                                    break

                        # If we got this far, we have a valid article
                        print(f"Successfully extracted article: {title}")

                        # Create article object and add to list
                        article = NewsArticle(
                            title=title,
                            url=link,
                            content=content,
                            source=source,
                            date=date
                        )

                        # Check if similar article already exists to avoid duplicates
                        is_duplicate = False
                        for existing_article in articles:
                            if sentence_similarity(existing_article.title, title) > 0.7:  # Lowered threshold
                                is_duplicate = True
                                print(f"Found duplicate article: {title}")
                                break

                        if not is_duplicate:
                            articles.append(article)
                            print(f"Added article: {title}")

                    except Exception as e:
                        print(f"Error processing article {link}: {str(e)}")
                        continue

            except Exception as e:
                print(f"Error searching {source_base} with query {query}: {str(e)}")
                continue

    # If we couldn't find enough articles, create some dummy articles to prevent errors
    if not articles and num_articles > 0:
        print(f"No articles found for {company_name}. Creating a dummy article to prevent errors.")

        dummy_article = NewsArticle(
            title=f"{company_name} Information",
            url="#",
            content=f"Information about {company_name} was not found or could not be retrieved. This is a placeholder.",
            source="System",
            date="",
            sentiment="Neutral",
            topics=["information", "company", "placeholder"]
        )

        articles.append(dummy_article)

    # Return collected articles
    print(f"Returning {len(articles)} articles for {company_name}")
    return articles[:num_articles]
|
| 604 |
+
|
| 605 |
+
def analyze_article_sentiment(article: NewsArticle) -> Dict[str, Any]:
    """Perform detailed sentiment analysis on an article."""
    # Score each paragraph with VADER, then average the scores at the end.
    paragraph_sentiments = []
    overall_scores = {'pos': 0, 'neg': 0, 'neu': 0, 'compound': 0}

    for para in article.content.split('\n'):
        # Very short paragraphs carry little sentiment signal; skip them.
        if len(para.strip()) < 20:
            continue

        para_scores = vader_analyzer.polarity_scores(para)
        preview = para[:100] + '...' if len(para) > 100 else para
        paragraph_sentiments.append({
            'text': preview,
            'scores': para_scores
        })

        for key in ('pos', 'neg', 'neu', 'compound'):
            overall_scores[key] += para_scores[key]

    # Average over the paragraphs that were actually scored.
    scored_count = len(paragraph_sentiments)
    if scored_count > 0:
        for key in ('pos', 'neg', 'neu', 'compound'):
            overall_scores[key] /= scored_count

    # Transformer-based sentiment on a truncated slice of the content.
    try:
        head = article.content[:512] if len(article.content) > 512 else article.content
        model_output = advanced_sentiment(head)[0]
        advanced_sentiment_label = model_output['label']
        advanced_confidence = model_output['score']
    except Exception as e:
        print(f"Error with advanced sentiment analysis: {str(e)}")
        advanced_sentiment_label = "Error"
        advanced_confidence = 0.0

    # Classify via the averaged compound score (+/-0.05 cut-offs).
    compound = overall_scores['compound']
    if compound >= 0.05:
        final_sentiment = "Positive"
    elif compound <= -0.05:
        final_sentiment = "Negative"
    else:
        final_sentiment = "Neutral"

    return {
        'article_title': article.title,
        'overall_sentiment': final_sentiment,
        'vader_scores': overall_scores,
        'advanced_sentiment': {
            'label': advanced_sentiment_label,
            'confidence': advanced_confidence
        },
        'paragraph_analysis': paragraph_sentiments,
        'positive_ratio': overall_scores['pos'],
        'negative_ratio': overall_scores['neg'],
        'neutral_ratio': overall_scores['neu']
    }
|
| 673 |
+
|
| 674 |
+
def perform_comparative_analysis(articles: List[NewsArticle]) -> Dict[str, Any]:
    """Compare sentiment and topic coverage across a set of analyzed articles.

    Args:
        articles: NewsArticle objects whose ``sentiment`` and ``topics``
            fields have already been populated.

    Returns:
        Dict with keys "Sentiment Distribution" (per-label counts),
        "Common Topics", "Topic Overlap", "Coverage Differences" (at most 10
        pairwise comparison dicts), and "Final Sentiment Analysis" (a
        human-readable summary string).
    """
    # Sentiment distribution with expanded categories
    sentiment_counts = {
        "Positive": 0,
        "Slightly Positive": 0,
        "Neutral": 0,
        "Slightly Negative": 0,
        "Negative": 0
    }

    for article in articles:
        if article.sentiment in sentiment_counts:
            sentiment_counts[article.sentiment] += 1
        else:
            # Fallback for any unexpected sentiment values
            sentiment_counts["Neutral"] += 1

    # Topic analysis
    all_topics = []
    for article in articles:
        all_topics.extend(article.topics)

    topic_counts = Counter(all_topics)
    common_topics = [topic for topic, count in topic_counts.most_common(10)]

    # Identify unique topics per article (topics no other article mentions).
    # O(n^2) over articles, which is fine for the small counts used here.
    unique_topics_by_article = {}
    for i, article in enumerate(articles):
        other_articles_topics = []
        for j, other_article in enumerate(articles):
            if i != j:
                other_articles_topics.extend(other_article.topics)

        unique_topics = [topic for topic in article.topics if topic not in other_articles_topics]
        unique_topics_by_article[i] = unique_topics

    # Generate comparisons
    comparisons = []

    # If we have more than one article, generate meaningful comparisons
    if len(articles) > 1:
        for i in range(len(articles) - 1):
            for j in range(i + 1, len(articles)):
                article1 = articles[i]
                article2 = articles[j]

                # Compare sentiments - more nuanced now with new categories
                if article1.sentiment != article2.sentiment:
                    # Group sentiments for better comparison
                    sent1_group = get_sentiment_group(article1.sentiment)
                    sent2_group = get_sentiment_group(article2.sentiment)

                    if sent1_group != sent2_group:
                        comparison = {
                            "Articles": [article1.title, article2.title],
                            "Comparison": f"'{article1.title}' presents a {sent1_group.lower()} view ({article1.sentiment}), while '{article2.title}' has a {sent2_group.lower()} view ({article2.sentiment}).",
                            "Impact": "This difference in sentiment highlights varying perspectives on the company's situation."
                        }
                        comparisons.append(comparison)
                    else:
                        # Even if in same group, note the difference if one is stronger
                        if "Slightly" in article1.sentiment and "Slightly" not in article2.sentiment or \
                           "Slightly" in article2.sentiment and "Slightly" not in article1.sentiment:
                            stronger = article1 if "Slightly" not in article1.sentiment else article2
                            # NOTE(review): uses == (equality) where identity is
                            # meant; relies on NewsArticle equality behaving
                            # identity-like here — confirm against the class.
                            weaker = article2 if stronger == article1 else article1

                            comparison = {
                                "Articles": [stronger.title, weaker.title],
                                "Comparison": f"'{stronger.title}' expresses a stronger {sent1_group.lower()} sentiment ({stronger.sentiment}) than '{weaker.title}' ({weaker.sentiment}).",
                                "Impact": "The difference in intensity suggests varying degrees of confidence about the company."
                            }
                            comparisons.append(comparison)

                # Compare topics
                common_topics_between_two = set(article1.topics).intersection(set(article2.topics))
                if common_topics_between_two:
                    comparison = {
                        "Articles": [article1.title, article2.title],
                        "Comparison": f"Both articles discuss {', '.join(common_topics_between_two)}.",
                        "Impact": "The common topics indicate key areas of focus around the company."
                    }
                    comparisons.append(comparison)

                # Compare unique topics
                unique_to_article1 = set(article1.topics) - set(article2.topics)
                unique_to_article2 = set(article2.topics) - set(article1.topics)

                if unique_to_article1 and unique_to_article2:
                    comparison = {
                        "Articles": [article1.title, article2.title],
                        "Comparison": f"'{article1.title}' uniquely covers {', '.join(unique_to_article1)}, while '{article2.title}' focuses on {', '.join(unique_to_article2)}.",
                        "Impact": "Different sources emphasize varying aspects of the company, offering a broader perspective."
                    }
                    comparisons.append(comparison)
    else:
        # If we only have one article, create a dummy comparison
        if articles:
            article = articles[0]
            topics_str = ", ".join(article.topics[:3]) if article.topics else "no specific topics"
            sentiment_group = get_sentiment_group(article.sentiment)

            comparisons = [
                {
                    "Comparison": f"Only found one article: '{article.title}' with a {article.sentiment.lower()} sentiment ({sentiment_group} overall).",
                    "Impact": f"Limited coverage focused on {topics_str}. More articles would provide a more balanced view."
                },
                {
                    "Comparison": f"The article discusses {topics_str} in relation to {article.source}.",
                    "Impact": "Single source reporting limits perspective. Consider searching for additional sources."
                }
            ]

    # Generate overall sentiment analysis
    # Combine slightly positive with positive and slightly negative with negative for summary
    pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
    neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
    neu_count = sentiment_counts["Neutral"]
    total = pos_count + neg_count + neu_count

    # For display, we'll keep detailed counts but summarize the analysis text
    if total == 0:
        final_analysis = "No sentiment data available."
    else:
        pos_ratio = pos_count / total
        neg_ratio = neg_count / total

        # Show more details on the sentiment breakdown
        sentiment_detail = []
        if sentiment_counts["Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Positive']} strongly positive")
        if sentiment_counts["Slightly Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} slightly positive")
        if sentiment_counts["Neutral"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Neutral']} neutral")
        if sentiment_counts["Slightly Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} slightly negative")
        if sentiment_counts["Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Negative']} strongly negative")

        sentiment_breakdown = ", ".join(sentiment_detail)

        # A >60% share in either direction counts as "primarily" that direction.
        if pos_ratio > 0.6:
            final_analysis = f"The company has primarily positive coverage ({pos_count}/{total} articles positive: {sentiment_breakdown}). This suggests a favorable market perception."
        elif neg_ratio > 0.6:
            final_analysis = f"The company has primarily negative coverage ({neg_count}/{total} articles negative: {sentiment_breakdown}). This could indicate challenges or controversies."
        elif pos_ratio > neg_ratio:
            final_analysis = f"The company has mixed coverage with a positive lean ({sentiment_breakdown})."
        elif neg_ratio > pos_ratio:
            final_analysis = f"The company has mixed coverage with a negative lean ({sentiment_breakdown})."
        else:
            final_analysis = f"The company has balanced coverage ({sentiment_breakdown})."

    # If we only have the dummy article, customize the final analysis
    # (search_news marks its placeholder article with url == "#")
    if len(articles) == 1 and articles[0].url == "#":
        final_analysis = "Limited news data available. The analysis is based on a placeholder article."

    return {
        "Sentiment Distribution": sentiment_counts,
        "Common Topics": common_topics,
        "Topic Overlap": {
            "Common Topics Across All": common_topics[:5],
            "Unique Topics By Article": unique_topics_by_article
        },
        "Coverage Differences": comparisons[:10],  # Limit to top 10 comparisons
        "Final Sentiment Analysis": final_analysis
    }
|
| 841 |
+
|
| 842 |
+
def get_sentiment_group(sentiment: str) -> str:
    """Group sentiments into broader categories for comparison."""
    # Collapse the five fine-grained labels down to three broad buckets;
    # anything unrecognised falls through to "Neutral".
    group_of = {
        "Positive": "Positive",
        "Slightly Positive": "Positive",
        "Negative": "Negative",
        "Slightly Negative": "Negative",
    }
    return group_of.get(sentiment, "Neutral")
|
| 850 |
+
|
| 851 |
+
def translate_to_hindi(text: str) -> str:
    """Translate text to Hindi using deep_translator."""
    # GoogleTranslator caps a single request at 5000 characters;
    # stay safely below that limit when chunking.
    chunk_limit = 4500
    try:
        translated_pieces = []
        for start in range(0, len(text), chunk_limit):
            # Translate one chunk at a time, pausing briefly between
            # requests to avoid rate limiting.
            piece = translator.translate(text[start:start + chunk_limit])
            translated_pieces.append(piece)
            time.sleep(0.5)

        return ''.join(translated_pieces)
    except Exception as e:
        print(f"Translation error: {str(e)}")
        # "Translation error occurred" in Hindi, as a fallback placeholder.
        return "अनुवाद त्रुटि हुई।"
|
| 870 |
+
|
| 871 |
+
def text_to_speech(text: str, output_file: str = 'output.mp3') -> str:
|
| 872 |
+
"""Convert text to speech in Hindi."""
|
| 873 |
+
try:
|
| 874 |
+
# Ensure output directory exists
|
| 875 |
+
output_dir = os.path.dirname(output_file)
|
| 876 |
+
if output_dir:
|
| 877 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 878 |
+
print(f"Ensuring output directory exists: {output_dir}")
|
| 879 |
+
|
| 880 |
+
# If text is too short, add some padding to avoid TTS errors
|
| 881 |
+
if len(text.strip()) < 5:
|
| 882 |
+
text = text + " " + "नमस्कार" * 3 # Add some padding text
|
| 883 |
+
print("Text was too short, adding padding")
|
| 884 |
+
|
| 885 |
+
print(f"Attempting to generate TTS for text of length {len(text)} characters")
|
| 886 |
+
|
| 887 |
+
# For long texts, split into chunks for better TTS quality
|
| 888 |
+
if len(text) > 3000:
|
| 889 |
+
print("Text is long, splitting into chunks for better TTS quality")
|
| 890 |
+
|
| 891 |
+
# Split at sentence boundaries
|
| 892 |
+
sentences = re.split(r'(।|\.|\?|\!)', text)
|
| 893 |
+
chunks = []
|
| 894 |
+
current_chunk = ""
|
| 895 |
+
|
| 896 |
+
# Combine sentences into chunks of appropriate size
|
| 897 |
+
for i in range(0, len(sentences), 2):
|
| 898 |
+
if i+1 < len(sentences): # Make sure we have the punctuation part
|
| 899 |
+
sentence = sentences[i] + sentences[i+1]
|
| 900 |
+
else:
|
| 901 |
+
sentence = sentences[i]
|
| 902 |
+
|
| 903 |
+
if len(current_chunk) + len(sentence) < 3000:
|
| 904 |
+
current_chunk += sentence
|
| 905 |
+
else:
|
| 906 |
+
if current_chunk:
|
| 907 |
+
chunks.append(current_chunk)
|
| 908 |
+
current_chunk = sentence
|
| 909 |
+
|
| 910 |
+
if current_chunk: # Add the last chunk
|
| 911 |
+
chunks.append(current_chunk)
|
| 912 |
+
|
| 913 |
+
print(f"Split text into {len(chunks)} chunks for TTS processing")
|
| 914 |
+
|
| 915 |
+
# Process each chunk and combine into one audio file
|
| 916 |
+
temp_files = []
|
| 917 |
+
for i, chunk in enumerate(chunks):
|
| 918 |
+
temp_output = f"{output_file}.part{i}.mp3"
|
| 919 |
+
try:
|
| 920 |
+
# Try gTTS for each chunk
|
| 921 |
+
tts = gTTS(text=chunk, lang='hi', slow=False)
|
| 922 |
+
tts.save(temp_output)
|
| 923 |
+
if os.path.exists(temp_output) and os.path.getsize(temp_output) > 0:
|
| 924 |
+
temp_files.append(temp_output)
|
| 925 |
+
else:
|
| 926 |
+
print(f"Failed to create chunk {i} with gTTS")
|
| 927 |
+
raise Exception(f"gTTS failed for chunk {i}")
|
| 928 |
+
except Exception as e:
|
| 929 |
+
print(f"Error with gTTS for chunk {i}: {str(e)}")
|
| 930 |
+
break
|
| 931 |
+
|
| 932 |
+
# If we have temp files, combine them
|
| 933 |
+
if temp_files:
|
| 934 |
+
try:
|
| 935 |
+
# Use pydub to concatenate audio files
|
| 936 |
+
from pydub import AudioSegment
|
| 937 |
+
combined = AudioSegment.empty()
|
| 938 |
+
for temp_file in temp_files:
|
| 939 |
+
audio = AudioSegment.from_mp3(temp_file)
|
| 940 |
+
combined += audio
|
| 941 |
+
|
| 942 |
+
combined.export(output_file, format="mp3")
|
| 943 |
+
|
| 944 |
+
# Clean up temp files
|
| 945 |
+
for temp_file in temp_files:
|
| 946 |
+
try:
|
| 947 |
+
os.remove(temp_file)
|
| 948 |
+
except:
|
| 949 |
+
pass
|
| 950 |
+
|
| 951 |
+
print(f"Successfully combined {len(temp_files)} audio chunks into {output_file}")
|
| 952 |
+
return output_file
|
| 953 |
+
except Exception as e:
|
| 954 |
+
print(f"Error combining audio files: {str(e)}")
|
| 955 |
+
# Try to return the first chunk at least
|
| 956 |
+
if os.path.exists(temp_files[0]):
|
| 957 |
+
import shutil
|
| 958 |
+
shutil.copy(temp_files[0], output_file)
|
| 959 |
+
print(f"Returning first chunk as fallback: {output_file}")
|
| 960 |
+
return output_file
|
| 961 |
+
|
| 962 |
+
# Method 1: Use gTTS for Hindi text-to-speech (for shorter texts or if chunking failed)
|
| 963 |
+
try:
|
| 964 |
+
print("Trying to use gTTS...")
|
| 965 |
+
tts = gTTS(text=text, lang='hi', slow=False)
|
| 966 |
+
tts.save(output_file)
|
| 967 |
+
|
| 968 |
+
# Verify the file was created and is not empty
|
| 969 |
+
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
|
| 970 |
+
print(f"Successfully created audio file with gTTS: {output_file} (size: {os.path.getsize(output_file)} bytes)")
|
| 971 |
+
return output_file
|
| 972 |
+
else:
|
| 973 |
+
print(f"gTTS created a file but it may be empty or invalid: {output_file}")
|
| 974 |
+
raise Exception("Generated audio file is empty or invalid")
|
| 975 |
+
|
| 976 |
+
except Exception as e:
|
| 977 |
+
print(f"gTTS error: {str(e)}")
|
| 978 |
+
|
| 979 |
+
# Method 2: Fallback to pyttsx3
|
| 980 |
+
try:
|
| 981 |
+
print("Falling back to pyttsx3...")
|
| 982 |
+
engine = pyttsx3.init()
|
| 983 |
+
# Try to find a Hindi voice, or use default
|
| 984 |
+
voices = engine.getProperty('voices')
|
| 985 |
+
found_hindi_voice = False
|
| 986 |
+
|
| 987 |
+
for voice in voices:
|
| 988 |
+
print(f"Checking voice: {voice.name}")
|
| 989 |
+
if 'hindi' in voice.name.lower():
|
| 990 |
+
print(f"Found Hindi voice: {voice.name}")
|
| 991 |
+
engine.setProperty('voice', voice.id)
|
| 992 |
+
found_hindi_voice = True
|
| 993 |
+
break
|
| 994 |
+
|
| 995 |
+
if not found_hindi_voice:
|
| 996 |
+
print("No Hindi voice found, using default voice")
|
| 997 |
+
|
| 998 |
+
engine.save_to_file(text, output_file)
|
| 999 |
+
engine.runAndWait()
|
| 1000 |
+
|
| 1001 |
+
# Verify the file was created and is not empty
|
| 1002 |
+
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
|
| 1003 |
+
print(f"Successfully created audio file with pyttsx3: {output_file} (size: {os.path.getsize(output_file)} bytes)")
|
| 1004 |
+
return output_file
|
| 1005 |
+
else:
|
| 1006 |
+
print(f"pyttsx3 created a file but it may be empty or invalid: {output_file}")
|
| 1007 |
+
raise Exception("Generated audio file is empty or invalid")
|
| 1008 |
+
|
| 1009 |
+
except Exception as e2:
|
| 1010 |
+
print(f"pyttsx3 error: {str(e2)}")
|
| 1011 |
+
|
| 1012 |
+
# If all TTS methods fail, create a simple notification sound as fallback
|
| 1013 |
+
try:
|
| 1014 |
+
print("Both TTS methods failed. Creating a simple audio notification instead.")
|
| 1015 |
+
# Generate a simple beep sound as a fallback (1 second, 440Hz)
|
| 1016 |
+
import numpy as np
|
| 1017 |
+
from scipy.io import wavfile
|
| 1018 |
+
|
| 1019 |
+
sample_rate = 44100
|
| 1020 |
+
duration = 1 # seconds
|
| 1021 |
+
t = np.linspace(0, duration, int(sample_rate * duration))
|
| 1022 |
+
|
| 1023 |
+
# Generate a simple tone
|
| 1024 |
+
frequency = 440 # Hz (A4 note)
|
| 1025 |
+
data = np.sin(2 * np.pi * frequency * t) * 32767
|
| 1026 |
+
data = data.astype(np.int16)
|
| 1027 |
+
|
| 1028 |
+
# Convert output_file from mp3 to wav
|
| 1029 |
+
wav_output_file = output_file.replace('.mp3', '.wav')
|
| 1030 |
+
wavfile.write(wav_output_file, sample_rate, data)
|
| 1031 |
+
|
| 1032 |
+
print(f"Created simple audio notification: {wav_output_file}")
|
| 1033 |
+
return wav_output_file
|
| 1034 |
+
|
| 1035 |
+
except Exception as e3:
|
| 1036 |
+
print(f"Failed to create fallback audio: {str(e3)}")
|
| 1037 |
+
return ""
|
| 1038 |
+
|
| 1039 |
+
return ""
|
| 1040 |
+
except Exception as e:
|
| 1041 |
+
print(f"TTS error: {str(e)}")
|
| 1042 |
+
return ""
|
| 1043 |
+
|
| 1044 |
+
def prepare_final_report(company_name: str, articles: List[NewsArticle],
                        comparative_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Prepare final report in the required format.

    Builds the API response dict for a company's news analysis: a list of
    per-article entries (title/summary/sentiment/topics), the comparative
    analysis passed in, and a Hindi narration string produced by composing a
    detailed summary and running it through ``translate_to_hindi``.

    Args:
        company_name: Display name of the company the articles are about.
        articles: Analyzed articles; each must expose ``title``, ``summary``,
            ``sentiment``, ``topics`` and ``content`` attributes.
        comparative_analysis: Must contain the keys "Sentiment Distribution"
            (itself with "Positive", "Slightly Positive", "Neutral",
            "Slightly Negative", "Negative" counts), "Common Topics" (list),
            and "Final Sentiment Analysis" (str) — all read below.

    Returns:
        Dict with keys "Company", "Articles", "Comparative Sentiment Score",
        "Final Sentiment Analysis" and "Hindi Summary".
    """
    article_data = []

    for article in articles:
        article_data.append({
            "Title": article.title,
            "Summary": article.summary,
            "Sentiment": article.sentiment,
            "Topics": article.topics
        })

    # Prepare a more detailed summary for TTS with actual content from articles
    summary_text = f"{company_name} के बारे में समाचार विश्लेषण। "

    # Add information about the number of articles found
    summary_text += f"कुल {len(articles)} लेख मिले। "

    # Add sentiment distribution
    sentiment_counts = comparative_analysis["Sentiment Distribution"]
    pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
    neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
    neu_count = sentiment_counts["Neutral"]

    # Only mention sentiment if at least one article was classified
    if pos_count > 0 or neg_count > 0 or neu_count > 0:
        sentiment_detail = []
        if sentiment_counts["Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Positive']} पूर्ण सकारात्मक")
        if sentiment_counts["Slightly Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} हल्का सकारात्मक")
        if sentiment_counts["Neutral"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Neutral']} तटस्थ")
        if sentiment_counts["Slightly Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} हल्का नकारात्मक")
        if sentiment_counts["Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Negative']} पूर्ण नकारात्मक")

        summary_text += f"भावना विश्लेषण: {', '.join(sentiment_detail)}। "

    # Add common topics with more detail (at most the top 5)
    common_topics = comparative_analysis["Common Topics"][:5]
    if common_topics:
        summary_text += f"मुख्य विषय हैं: {', '.join(common_topics)}। "

        # Add more context about the common topics
        summary_text += "इन विषयों के बारे में लेखों में यह कहा गया है: "

        # Find sentences related to common topics in the articles
        topic_sentences = []
        for topic in common_topics[:3]:  # Focus on top 3 topics
            found = False
            for article in articles:
                if topic in article.content.lower():
                    # Find sentences containing this topic; keep the first
                    # short (<150 chars) sentence that mentions it.
                    # NOTE(review): topic-vs-sentence comparison mixes a
                    # lowercased topic against sentence.lower() but the
                    # article.content membership test above is the gate —
                    # behavior kept as-is.
                    sentences = sent_tokenize(article.content)
                    for sentence in sentences:
                        if topic in sentence.lower() and len(sentence) < 150:
                            topic_sentences.append(f"{topic} के बारे में: {sentence}")
                            found = True
                            break
                if found:
                    # One sentence per topic is enough; stop scanning articles
                    break

        if topic_sentences:
            summary_text += " ".join(topic_sentences[:3]) + " "

    # Add article summaries (up to 3 articles, 200 chars of each summary)
    summary_text += "लेखों का सारांश: "
    for i, article in enumerate(articles[:3]):  # Include up to 3 articles
        summary_text += f"लेख {i+1}: {article.title}. {article.summary[:200]}... "

        # Add sentiment for this specific article
        summary_text += f"इस लेख का भावना: {article.sentiment}. "

    # Add final sentiment analysis
    summary_text += comparative_analysis["Final Sentiment Analysis"]

    # Translate the detailed summary to Hindi (parts are already Hindi;
    # translate_to_hindi is expected to handle mixed-language input)
    hindi_summary = translate_to_hindi(summary_text)

    # Format the response according to the required format
    return {
        "Company": company_name,
        "Articles": article_data,
        "Comparative Sentiment Score": comparative_analysis,
        "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
        "Hindi Summary": hindi_summary
    }