Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -28,13 +28,40 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 28 |
from urllib.parse import urlparse, urljoin
|
| 29 |
import time
|
| 30 |
import random
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
app = Flask(__name__)
|
| 33 |
-
CORS(app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Set up logging
|
| 36 |
logging.basicConfig(level=logging.INFO)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# Set your Google API key securely using an environment variable
|
| 39 |
google_api_key = os.getenv('GOOGLE_API_KEY')
|
| 40 |
genai.configure(api_key=google_api_key)
|
|
@@ -99,6 +126,9 @@ ENERGY_COMPANIES = [
|
|
| 99 |
"https://www.orano.group/en/"
|
| 100 |
]
|
| 101 |
|
|
|
|
|
|
|
|
|
|
| 102 |
def allowed_file(filename):
    """Return True when *filename* has an extension found in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, lower-cased before the
    membership test. A name that contains no dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
|
| 104 |
|
|
@@ -207,237 +237,345 @@ def analyze_document(text):
|
|
| 207 |
return response.text
|
| 208 |
|
| 209 |
def process_query(query, role=None, file_id=None):
|
| 210 |
-
logging.info(f"Processing query: {query}, role: {role}, file_id: {file_id}")
|
| 211 |
if file_id:
|
| 212 |
return answer_query_from_document(query, file_id)
|
| 213 |
else:
|
| 214 |
system_prompt = f"You are an AI assistant specializing in {role}." if role else "You are a helpful AI assistant."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
def scrape_company_news(url):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
try:
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 240 |
|
| 241 |
-
#
|
| 242 |
-
articles = soup.find_all('article')
|
|
|
|
|
|
|
| 243 |
|
| 244 |
news_items = []
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
| 249 |
news_items.append({
|
| 250 |
-
'title':
|
| 251 |
-
'url': urljoin(url,
|
| 252 |
'source': urlparse(url).netloc
|
| 253 |
})
|
| 254 |
-
|
| 255 |
return news_items
|
| 256 |
except Exception as e:
|
| 257 |
logging.error(f"Error scraping {url}: {str(e)}")
|
| 258 |
return []
|
| 259 |
|
| 260 |
-
def get_company_news():
    """Scrape every site in ENERGY_COMPANIES concurrently and merge the results.

    Returns:
        A flat list of the news-item dicts returned by scrape_company_news
        for each URL, in completion order.
    """
    all_company_news = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # All requests are dispatched up front, so sleeping between results
        # (as the previous version did) never throttled the remote servers;
        # it only delayed the caller. The delay has therefore been removed.
        futures = [executor.submit(scrape_company_news, url) for url in ENERGY_COMPANIES]
        for future in as_completed(futures):
            all_company_news.extend(future.result())
    return all_company_news
|
| 268 |
-
|
| 269 |
def get_energy_news(query):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
try:
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
except Exception as e:
|
| 274 |
logging.error(f"Error fetching news: {e}")
|
| 275 |
return []
|
| 276 |
|
| 277 |
-
def
|
| 278 |
-
source = item.get('source', {}).get('name') if not is_company_news else item.get('source')
|
| 279 |
-
title = item.get('title', 'No title')
|
| 280 |
-
content = item.get('description', '') or item.get('content', '') or ''
|
| 281 |
-
url = item.get('url', '#')
|
| 282 |
-
|
| 283 |
-
prompt = f"""
|
| 284 |
-
Analyze the following news item in the context of the energy market:
|
| 285 |
-
Query: {query}
|
| 286 |
-
Source: {source}
|
| 287 |
-
Title: {title}
|
| 288 |
-
Content: {content}
|
| 289 |
-
URL: {url}
|
| 290 |
-
|
| 291 |
-
Is this news item directly relevant to "{query}" in the context of the energy market?
|
| 292 |
-
Answer ONLY 'YES' or 'NO', followed by a brief explanation.
|
| 293 |
-
If YES, provide:
|
| 294 |
-
1. A concise 2-3 sentence summary of the news.
|
| 295 |
-
2. Key points (up to 3 bullet points).
|
| 296 |
-
3. Specific impact on the energy market related to {query} (1-2 sentences).
|
| 297 |
"""
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
| 299 |
try:
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
except Exception as e:
|
| 313 |
-
logging.error(f"Error
|
| 314 |
return None
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
def filter_and_analyze_news(query, articles, company_news):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
all_news = articles + company_news
|
| 318 |
filtered_and_analyzed_news = []
|
| 319 |
|
| 320 |
with ThreadPoolExecutor(max_workers=20) as executor:
|
| 321 |
-
future_to_item = {
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
| 323 |
result = future.result()
|
| 324 |
if result:
|
| 325 |
filtered_and_analyzed_news.append(result)
|
| 326 |
-
|
| 327 |
if len(filtered_and_analyzed_news) >= 20:
|
| 328 |
break
|
| 329 |
|
| 330 |
return filtered_and_analyzed_news
|
| 331 |
|
| 332 |
def generate_market_summary(query, filtered_news):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
if not filtered_news:
|
| 334 |
return f"No relevant news found for '{query}' in the energy market context."
|
| 335 |
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
combined_summary = "\n\n".join(summaries)
|
| 341 |
|
| 342 |
prompt = f"""
|
| 343 |
-
Based on the following
|
| 344 |
{combined_summary}
|
|
|
|
| 345 |
Provide a comprehensive market summary that:
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
4. Compares and contrasts information from general news sources and energy company announcements.
|
| 350 |
-
5. Identifies any discrepancies or complementary information between general news and company-specific news.
|
| 351 |
-
Keep the summary focused on factual information derived from the news articles and company announcements, without adding speculation or personal opinions.
|
| 352 |
-
Organize the summary into clear sections with appropriate subheadings.
|
| 353 |
"""
|
| 354 |
-
|
| 355 |
try:
|
| 356 |
response = model.generate_content(prompt)
|
| 357 |
return response.text.strip()
|
| 358 |
except Exception as e:
|
| 359 |
-
logging.error(f"Error generating market summary: {e}")
|
| 360 |
-
return
|
| 361 |
|
| 362 |
@app.route('/')
def index():
    """Render the index.html template for the root URL."""
    page = render_template('index.html')
    return page
|
| 365 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
@app.route('/query', methods=['POST'])
|
|
|
|
| 367 |
def query():
|
| 368 |
data = request.json
|
|
|
|
|
|
|
|
|
|
| 369 |
query = data.get('query')
|
| 370 |
role = data.get('role')
|
| 371 |
file_id = data.get('file_id')
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
try:
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
prompt = f"""
|
| 378 |
-
You are an AI News Analyst specializing in the energy market. Use the following news context and your general knowledge to answer the query.
|
| 379 |
-
|
| 380 |
-
News Context:
|
| 381 |
-
Market Summary: {news_context.get('market_summary', 'No market summary available.')}
|
| 382 |
-
|
| 383 |
-
Top Articles:
|
| 384 |
-
{' '.join([f"- {article['title']}: {article['summary']}" for article in news_context.get('top_articles', [])])}
|
| 385 |
-
|
| 386 |
-
Query: {query}
|
| 387 |
-
|
| 388 |
-
Provide a comprehensive answer that:
|
| 389 |
-
1. Directly addresses the query using information from the news context.
|
| 390 |
-
2. Incorporates relevant general knowledge about the energy market.
|
| 391 |
-
3. Highlights any connections or insights between the query and the recent news.
|
| 392 |
-
4. Offers a balanced perspective, considering both general news and company-specific announcements.
|
| 393 |
-
5. Suggests potential implications or future trends based on the available information.
|
| 394 |
-
|
| 395 |
-
Format your response with clear headings and bullet points where appropriate.
|
| 396 |
-
"""
|
| 397 |
-
response = model.generate_content(prompt)
|
| 398 |
-
return jsonify({'response': response.text})
|
| 399 |
-
else:
|
| 400 |
-
# Handle regular queries as before
|
| 401 |
-
response = process_query(query, role, file_id)
|
| 402 |
-
return jsonify({'response': response})
|
| 403 |
except Exception as e:
|
| 404 |
-
logging.error(f"Error
|
| 405 |
return jsonify({'error': str(e)}), 500
|
| 406 |
|
| 407 |
@app.route('/upload', methods=['POST'])
|
|
|
|
| 408 |
def upload_file():
|
| 409 |
if 'file' not in request.files:
|
| 410 |
return jsonify({'error': 'No file part'}), 400
|
|
|
|
| 411 |
file = request.files['file']
|
| 412 |
if file.filename == '':
|
| 413 |
return jsonify({'error': 'No selected file'}), 400
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
try:
|
| 418 |
-
file_content = file.read()
|
| 419 |
-
logging.info(f"File uploaded successfully: {filename}")
|
| 420 |
-
extracted_text = process_document(file_content, filename)
|
| 421 |
-
text_chunks = get_text_chunks(extracted_text)
|
| 422 |
-
analysis = analyze_document(extracted_text)
|
| 423 |
-
|
| 424 |
-
file_id = len(files_storage) + 1
|
| 425 |
-
files_storage[file_id] = {
|
| 426 |
-
'filename': filename,
|
| 427 |
-
'file_data': base64.b64encode(file_content).decode('utf-8'),
|
| 428 |
-
'analysis': analysis
|
| 429 |
-
}
|
| 430 |
-
|
| 431 |
-
create_vector_store(text_chunks, file_id)
|
| 432 |
-
|
| 433 |
-
logging.info(f"File processing completed and saved to in-memory storage with ID: {file_id}")
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
@app.route('/plot', methods=['POST'])
|
| 443 |
def plot():
|
|
@@ -498,36 +636,101 @@ def process_dataframe_query(df, query):
|
|
| 498 |
|
| 499 |
@app.route('/fetch_news', methods=['POST'])
|
| 500 |
def fetch_news():
|
| 501 |
-
data = request.json
|
| 502 |
-
query = data.get('query')
|
| 503 |
try:
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
'
|
| 515 |
-
'
|
| 516 |
-
'source': article.get('source', 'Unknown'),
|
| 517 |
-
'summary': summary,
|
| 518 |
-
'is_company_news': article.get('is_company_news', False)
|
| 519 |
})
|
| 520 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
return jsonify({
|
| 522 |
-
'
|
| 523 |
-
'
|
| 524 |
-
'
|
| 525 |
})
|
|
|
|
| 526 |
except Exception as e:
|
| 527 |
-
logging.error(f"Error in fetch_news route: {str(e)}"
|
| 528 |
-
return jsonify({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
|
| 530 |
if __name__ == '__main__':
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from urllib.parse import urlparse, urljoin
|
| 29 |
import time
|
| 30 |
import random
|
| 31 |
+
from functools import wraps
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from dotenv import load_dotenv
|
| 34 |
+
from requests.adapters import HTTPAdapter
|
| 35 |
+
from urllib3.util.retry import Retry
|
| 36 |
+
from transformers import pipeline
|
| 37 |
+
import torch
|
| 38 |
+
|
| 39 |
+
load_dotenv()  # Load environment variables from .env file

app = Flask(__name__)
# Enable CORS on every route, for any origin, limited to the methods and
# request header listed here.
CORS(app, resources={
    r"/*": {
        "origins": "*",
        "methods": ["GET", "POST", "OPTIONS"],
        "allow_headers": ["Content-Type"]
    }
})

# Set up logging
logging.basicConfig(level=logging.INFO)

# Get the directory containing app.py
BASE_DIR = Path(__file__).resolve().parent

# Load environment variables from the .env file located next to app.py
env_path = BASE_DIR / '.env'
load_dotenv(env_path)

# Startup diagnostics. Report only WHETHER the API key is set: the key is a
# secret and must never be written to stdout or to the logs. The existence
# check looks at env_path (the file actually loaded above), not at the
# current working directory.
logging.info("Current working directory: %s", os.getcwd())
logging.info("Env file exists: %s", env_path.exists())
logging.info("GOOGLE_API_KEY configured: %s", bool(os.getenv('GOOGLE_API_KEY')))

# Set your Google API key securely using an environment variable
google_api_key = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=google_api_key)
|
|
|
|
| 126 |
"https://www.orano.group/en/"
|
| 127 |
]
|
| 128 |
|
| 129 |
+
# Initialize the local summarization pipeline (model: facebook/bart-large-cnn).
# It is built once, at import time, and is the object local_summarize() calls.
local_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 131 |
+
|
| 132 |
def allowed_file(filename):
    """Return True when *filename* has an extension found in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, lower-cased before the
    membership test. A name that contains no dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
|
| 134 |
|
|
|
|
| 237 |
return response.text
|
| 238 |
|
| 239 |
def process_query(query, role=None, file_id=None):
    """Answer *query* from an uploaded document or directly from the model.

    Args:
        query: The user's question.
        role: Optional specialty inserted into the system prompt; a generic
            assistant prompt is used when it is falsy.
        file_id: Optional document identifier. When truthy, the query is
            delegated to answer_query_from_document and the model prompt
            below is not used.

    Returns:
        The value returned by answer_query_from_document, or the text of the
        model response.

    Raises:
        Exception: Any error raised while generating content is logged and
            re-raised.
    """
    # Document-backed queries take a separate path.
    if file_id:
        return answer_query_from_document(query, file_id)

    if role:
        system_prompt = f"You are an AI assistant specializing in {role}."
    else:
        system_prompt = "You are a helpful AI assistant."

    prompt = f"""
        {system_prompt}

        Please format your response using markdown with proper structure:
        - Use '##' for main headings
        - Use '**' for bold text
        - Use bullet points ('*') for lists
        - Add proper spacing between sections
        - Structure the content hierarchically
        - Use proper paragraphs with line breaks

        Query: "{query}"

        Remember to:
        - Format the response clearly and professionally
        - Use headings for different sections
        - Break down complex information into digestible parts
        - Use bold text for emphasis on key terms
        - Maintain consistent spacing
        """
    try:
        generated = model.generate_content(prompt)
        return generated.text
    except Exception as e:
        logging.error(f"Error generating content: {str(e)}", exc_info=True)
        raise
|
| 270 |
|
| 271 |
+
def local_summarize(text):
    """Summarize *text* with the local Hugging Face summarization pipeline.

    Text longer than ``max_words`` whitespace-separated words is split into
    consecutive chunks of at most ``max_words`` words; each chunk is
    summarized on its own and the partial summaries are joined with spaces.

    Args:
        text: The text to summarize. ``None``, empty and whitespace-only
            input is accepted.

    Returns:
        The summary string, or ``""`` when there is nothing to summarize.
    """
    # Maximum number of words (roughly) handed to the summarizer in one call.
    max_words = 800
    words = text.split() if text else []

    # Nothing to summarize: do not invoke the model on an empty string.
    if not words:
        return ""

    def summarize_piece(piece):
        # truncation=True asks the pipeline's tokenizer to clip any input that
        # still exceeds the model's maximum length (800 words can be more
        # tokens than the model accepts) instead of failing.
        output = local_summarizer(
            piece, max_length=150, min_length=40, do_sample=False, truncation=True
        )
        return output[0]['summary_text']

    if len(words) <= max_words:
        return summarize_piece(text)

    chunks = (
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    )
    return " ".join(summarize_piece(chunk) for chunk in chunks)
|
| 299 |
|
| 300 |
def scrape_company_news(url):
    """Scrape the top company news items from the page at *url*.

    The page is fetched through a session that retries on 429/500/502/503/504
    responses. Candidate elements are every ``<article>`` plus every ``<div>``
    with class ``news-item`` or ``press-release``; only the first five
    candidates are inspected.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of dicts with the keys ``'title'``, ``'url'`` (made absolute
        against *url*) and ``'source'`` (the host of *url*). An empty list is
        returned when the request or the parsing fails.
    """
    try:
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36')
        }

        # The context manager closes the session (and its pooled connections)
        # even when the request raises.
        with requests.Session() as session:
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            response = session.get(url, headers=headers, timeout=10)

        # Do not parse error pages (403, 404, ...) as if they were news.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Combine results from multiple selectors
        articles = soup.find_all('article')
        articles.extend(soup.find_all('div', class_='news-item'))
        articles.extend(soup.find_all('div', class_='press-release'))

        source = urlparse(url).netloc
        news_items = []
        # Only take the first 5 items (adjust as needed)
        for article in articles[:5]:
            title_elem = article.find('h2') or article.find('h3') or article.find('a')
            link_elem = article.find('a')
            if not (title_elem and link_elem and link_elem.has_attr('href')):
                continue
            title = title_elem.get_text(strip=True)
            if not title:
                # e.g. an image-only link: nothing meaningful to show.
                continue
            news_items.append({
                'title': title,
                'url': urljoin(url, link_elem['href']),
                'source': source
            })
        return news_items
    except Exception as e:
        logging.error(f"Error scraping {url}: {str(e)}")
        return []
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
def get_energy_news(query):
    """Fetch the latest news articles for *query* from the NewsData.io API.

    The request is restricted to country ``us``, language ``en`` and category
    ``business``. The API key is read from the ``NEWSDATA_API_KEY``
    environment variable.

    Args:
        query: Search terms sent as the ``q`` parameter.

    Returns:
        The list found under ``"results"`` in a successful response, or an
        empty list when the key is missing, the API reports a non-success
        status, or the request fails.
    """
    logging.info(f"Starting news fetch for query: {query}")

    news_data_api_key = os.getenv('NEWSDATA_API_KEY')
    if not news_data_api_key:
        logging.error("NewsData API key not found in environment variables")
        return []

    endpoint = "https://newsdata.io/api/1/news"

    params = {
        'apikey': news_data_api_key,
        'q': query,
        'country': 'us',
        'language': 'en',
        'category': 'business'
    }

    # Never write the API key to the logs: log a redacted copy of the params.
    loggable_params = {
        name: ('***' if name == 'apikey' else value)
        for name, value in params.items()
    }
    logging.info(f"Making API request to: {endpoint}")
    logging.info(f"With parameters: {loggable_params}")

    try:
        response = requests.get(endpoint, params=params, timeout=10)
        logging.info(f"API Response status code: {response.status_code}")

        response.raise_for_status()
        data = response.json()

        if data.get("status") == "success":
            # "results" may be present but null; normalise to a list.
            articles = data.get("results") or []
            logging.info(f"Successfully fetched {len(articles)} articles")
            return articles

        logging.error(f"NewsData API error response: {data}")
        return []

    except Exception as e:
        logging.error(f"Error fetching news: {e}")
        return []
|
| 383 |
|
| 384 |
+
def robust_analyze_news_item(item, query):
    """Analyze one news item with the generative model, with a local fallback.

    Args:
        item: Mapping describing the article. The text is taken from
            ``'description'`` or ``'content'``; the link from ``'link'`` or
            ``'url'``; the source from ``'source_id'`` or ``'source'``. The
            second key of each pair is the one scrape_company_news emits.
            Keys that are missing or hold ``None`` are treated as absent.
        query: The user's search query. Currently not used in the prompt;
            kept so the signature stays stable for callers.

    Returns:
        A dict with the keys ``'title'``, ``'link'``, ``'source'`` and
        ``'analysis'``, or ``None`` when the item has fewer than 10
        characters of content or the analysis fails entirely.
    """
    logging.info(f"Starting analysis for article: {item.get('title', 'No title')}")

    try:
        # Extract article information. `or` (rather than a .get default)
        # also covers keys that are present but null.
        title = item.get('title') or ''
        content = item.get('description') or item.get('content') or ''
        source = item.get('source_id') or item.get('source') or 'Unknown Source'
        url = item.get('link') or item.get('url') or '#'

        logging.info(f"Article details - Title: {title[:100]}...")
        logging.info(f"Content length: {len(content)} characters")

        # Skip if no meaningful content
        if len(content.strip()) < 10:
            logging.warning(f"Skipping article due to insufficient content: {title}")
            return None

        prompt = f"""
        Analyze this news article:
        Title: {title}
        Content: {content}

        Provide a brief analysis in the following format:
        1. Summary (2-3 sentences)
        2. Key Points (up to 3 bullet points)
        3. Market Impact (1-2 sentences about potential market implications)
        """

        logging.info("Attempting analysis with generative model")
        try:
            response = model.generate_content(prompt)
            analysis = response.text.strip()
            logging.info("Successfully generated analysis with model")
        except Exception as e:
            # Any model failure falls back to the local summarizer.
            logging.warning(f"Generative model failed: {str(e)}")
            logging.info("Falling back to local summarizer")
            analysis = local_summarize(content)

        result = {
            'title': title,
            'link': url,
            'source': source,
            'analysis': analysis
        }
        logging.info("Successfully created analysis result")
        return result

    except Exception as e:
        logging.error(f"Error in robust_analyze_news_item: {str(e)}")
        return None
|
| 438 |
|
| 439 |
+
def get_company_news():
    """Scrape every site in ENERGY_COMPANIES concurrently and merge the results.

    Returns:
        A flat list of the news-item dicts returned by scrape_company_news
        for each URL, in completion order.
    """
    all_company_news = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # All requests are dispatched up front, so sleeping between results
        # (as the previous version did) never throttled the remote servers;
        # it only delayed the caller. The delay has therefore been removed.
        futures = [executor.submit(scrape_company_news, url) for url in ENERGY_COMPANIES]
        for future in as_completed(futures):
            all_company_news.extend(future.result())
    return all_company_news
|
| 447 |
+
|
| 448 |
def filter_and_analyze_news(query, articles, company_news, max_results=20):
    """Analyze News API results and scraped company news in parallel.

    Every item is passed to robust_analyze_news_item on a thread pool; items
    for which it returns a falsy value are dropped.

    Args:
        query: The user's search query, forwarded to the analyzer.
        articles: List of article dicts from the news API.
        company_news: List of item dicts from the company-site scraper.
        max_results: Stop collecting once this many analyses are available
            (default 20, the previously hard-coded limit).

    Returns:
        A list of at most ``max_results`` analysis dicts, in completion order.
    """
    all_news = articles + company_news
    filtered_and_analyzed_news = []

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(robust_analyze_news_item, item, query)
            for item in all_news
        ]
        for future in as_completed(futures):
            result = future.result()
            if result:
                filtered_and_analyzed_news.append(result)

            if len(filtered_and_analyzed_news) >= max_results:
                # Leaving the `with` block waits for every outstanding
                # future, so drop the ones that have not started yet;
                # otherwise the early exit saves no work at all.
                for pending in futures:
                    pending.cancel()
                break

    return filtered_and_analyzed_news
|
| 469 |
|
| 470 |
def generate_market_summary(query, filtered_news):
    """Build an overall market summary from the individual news analyses.

    The analyses are concatenated into a single prompt for the generative
    model. If the model call fails, the concatenated text is summarized with
    local_summarize instead.

    Args:
        query: The topic the summary should focus on.
        filtered_news: List of dicts carrying ``'title'`` and ``'analysis'``.

    Returns:
        The summary text, or a fixed "no relevant news" message when
        *filtered_news* is empty.
    """
    if not filtered_news:
        return f"No relevant news found for '{query}' in the energy market context."

    # One "Title / Analysis" entry per news item, separated by blank lines.
    combined_summary = "\n\n".join(
        f"Title: {entry.get('title', 'No title')}\nAnalysis: {entry.get('analysis', '')}\n"
        for entry in filtered_news
    )

    prompt = f"""
    Based on the following news analyses:
    {combined_summary}

    Provide a comprehensive market summary that:
    - Highlights current trends related to {query} in the energy market.
    - Identifies key insights and potential market impacts.
    - Organizes the information into clearly defined sections.
    """
    try:
        generated = model.generate_content(prompt)
        return generated.text.strip()
    except Exception as e:
        logging.error(f"Error generating market summary using API: {e}. Falling back to local summarization.")
        return local_summarize(combined_summary)
|
| 499 |
|
| 500 |
@app.route('/')
def index():
    """Render the index.html template for the root URL."""
    page = render_template('index.html')
    return page
|
| 503 |
|
| 504 |
+
# Add error handling decorator
|
| 505 |
+
def handle_errors(f):
    """Decorator that turns exceptions raised by a Flask view into JSON errors.

    * HTTP errors raised by Flask/Werkzeug (for example the 400/415 produced
      when a request body is not valid JSON) keep their own status code
      instead of being reported as a server error.
    * Any other exception is logged with its traceback and reported as 500.

    In both cases the response body is ``{'error': <message>}``.

    Args:
        f: The view function to wrap.

    Returns:
        The wrapped view function.
    """
    # Local import keeps this block self-contained; Werkzeug is the HTTP
    # layer Flask is built on, so it is always installed alongside Flask.
    from werkzeug.exceptions import HTTPException

    @wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except HTTPException as e:
            # A client-side problem, not a server failure: keep its status.
            return jsonify({'error': e.description}), (e.code or 500)
        except Exception as e:
            logging.error(f"Error in {f.__name__}: {str(e)}", exc_info=True)
            return jsonify({'error': str(e)}), 500
    return wrapper
|
| 514 |
+
|
| 515 |
@app.route('/query', methods=['POST'])
@handle_errors
def query():
    """Handle POST /query: validate the JSON body and answer the question.

    Expected JSON fields: ``query`` (required), ``role`` (required) and
    ``file_id`` (optional).

    Returns:
        ``{'response': ...}`` on success; ``{'error': ...}`` with status 400
        when the body is missing, is not a JSON object, or lacks a required
        field; ``{'error': ...}`` with status 500 when processing fails.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the client receives the 400 below rather than a 500.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({'error': 'No data provided'}), 400

    # Named user_query so the local does not shadow this view function.
    user_query = data.get('query')
    role = data.get('role')
    file_id = data.get('file_id')

    if not user_query:
        return jsonify({'error': 'No query provided'}), 400
    if not role:
        return jsonify({'error': 'No role provided'}), 400

    try:
        response = process_query(user_query, role, file_id)
        return jsonify({'response': response})
    except Exception as e:
        logging.error(f"Error processing query: {str(e)}", exc_info=True)
        return jsonify({'error': str(e)}), 500
|
| 537 |
|
| 538 |
@app.route('/upload', methods=['POST'])
@handle_errors
def upload_file():
    """Handle POST /upload: store an uploaded document and analyze it.

    The file is read into memory, its text is extracted, chunked and
    analyzed, the raw bytes (base64-encoded) and the analysis are saved in
    files_storage under a new id, and a vector store is created for the
    chunks.

    Returns:
        ``{'file_id', 'analysis', 'message'}`` on success; ``{'error': ...}``
        with status 400 for a missing, unnamed or disallowed file, or with
        status 500 when processing fails.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    upload = request.files['file']
    if upload.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not allowed_file(upload.filename):
        return jsonify({'error': 'Invalid file type'}), 400

    try:
        raw_bytes = upload.read()
        safe_name = secure_filename(upload.filename)

        # Process the file
        document_text = process_document(raw_bytes, safe_name)
        chunks = get_text_chunks(document_text)
        document_analysis = analyze_document(document_text)

        # Generate file ID and store
        new_id = len(files_storage) + 1
        encoded_bytes = base64.b64encode(raw_bytes).decode('utf-8')
        files_storage[new_id] = {
            'filename': safe_name,
            'file_data': encoded_bytes,
            'analysis': document_analysis
        }

        # Create vector store
        create_vector_store(chunks, new_id)

        payload = {
            'file_id': new_id,
            'analysis': document_analysis,
            'message': 'File processed successfully'
        }
        return jsonify(payload)
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}", exc_info=True)
        return jsonify({'error': str(e)}), 500
|
| 579 |
|
| 580 |
@app.route('/plot', methods=['POST'])
|
| 581 |
def plot():
|
|
|
|
| 636 |
|
| 637 |
@app.route('/fetch_news', methods=['POST'])
def fetch_news():
    """Handle POST /fetch_news: fetch, analyze and summarize news for a query.

    Reads ``query`` from the JSON body, fetches articles with
    get_energy_news, analyzes each with robust_analyze_news_item and asks the
    generative model for a market summary of the first five analyses.

    Returns:
        A JSON object with ``status``, ``articles`` and ``summary`` (plus
        ``message`` on errors). Status 500 is returned only when an
        unexpected exception occurs.
    """
    try:
        # A JSON body of `null` parses to None; treat it as an empty object.
        data = request.json or {}
        query = data.get('query', '')

        # Fetch articles from NewsData.io
        articles = get_energy_news(query)

        if not articles:
            return jsonify({
                'status': 'error',
                'message': 'No articles found',
                'articles': [],
                'summary': f"No relevant news found for '{query}'"
            })

        # Analyze articles, keeping only those that produced a result
        analyzed_articles = []
        for article in articles:
            analysis = robust_analyze_news_item(article, query)
            if analysis:
                analyzed_articles.append(analysis)

        if not analyzed_articles:
            return jsonify({
                'status': 'error',
                'message': 'No articles could be analyzed',
                'articles': [],
                'summary': f"Could not analyze any articles for '{query}'"
            })

        # Build the article list outside the prompt f-string: a backslash
        # inside an f-string replacement field is a SyntaxError before
        # Python 3.12, which made the whole module fail to import there.
        articles_text = ' '.join(
            f"Article: {a['title']}\nAnalysis: {a['analysis']}\n\n"
            for a in analyzed_articles[:5]
        )

        # Generate market summary
        summary_prompt = f"""
        Based on the following analyzed news articles about "{query}":

        {articles_text}

        Provide a comprehensive market summary that:
        1. Highlights the main trends and developments
        2. Identifies potential market impacts
        3. Suggests key takeaways for stakeholders
        """

        try:
            summary_response = model.generate_content(summary_prompt)
            market_summary = summary_response.text.strip()
        except Exception as e:
            logging.error(f"Error generating market summary: {str(e)}")
            market_summary = "Unable to generate market summary due to an error."

        return jsonify({
            'status': 'success',
            'articles': analyzed_articles,
            'summary': market_summary
        })

    except Exception as e:
        logging.error(f"Error in fetch_news route: {str(e)}")
        return jsonify({
            'status': 'error',
            'message': str(e),
            'articles': [],
            'summary': "An error occurred while processing the news request."
        }), 500
|
| 702 |
+
|
| 703 |
+
# Add health check endpoint
|
| 704 |
+
@app.route('/health', methods=['GET'])
def health_check():
    """Report that the service is up and whether a Google API key was loaded."""
    payload = {
        'status': 'healthy',
        'api_key_configured': bool(google_api_key)
    }
    return jsonify(payload)
|
| 707 |
+
|
| 708 |
+
# Ensure all required environment variables are set
|
| 709 |
+
def check_environment(required_vars=('GOOGLE_API_KEY',)):
    """Verify that every required environment variable is set and non-empty.

    Args:
        required_vars: Names of the variables to check. Defaults to the
            single variable the application has always required.

    Raises:
        EnvironmentError: If one or more variables are unset or empty; the
            message lists the missing names.
    """
    missing_vars = [var for var in required_vars if not os.getenv(var)]
    if missing_vars:
        raise EnvironmentError(f"Missing required environment variables: {', '.join(missing_vars)}")
|
| 714 |
|
| 715 |
if __name__ == '__main__':
    # Script entry point: validate configuration, then start the Flask server.
    try:
        # Check environment variables
        check_environment()

        # Configure logging. The root logger was already configured when the
        # module was imported, and basicConfig() is a no-op in that case
        # unless force=True is given, so force it to apply this format.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            force=True
        )

        # Initialize Google AI
        if not google_api_key:
            raise ValueError("GOOGLE_API_KEY not configured")
        genai.configure(api_key=google_api_key)

        # Start server
        port = int(os.environ.get('PORT', 7860))
        # The server listens on every interface, so the interactive debugger
        # must not be on by default: it lets anyone who can reach the port
        # execute code. Opt in explicitly with FLASK_DEBUG=1 during development.
        debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
        app.run(host='0.0.0.0', port=port, debug=debug)
    except Exception as e:
        logging.error(f"Failed to start server: {str(e)}", exc_info=True)
        raise
|