AliInamdar committed on
Commit
5f62365
·
verified ·
1 Parent(s): 27caad9

Upload helper.py

Browse files
Files changed (1) hide show
  1. helper.py +298 -0
helper.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Helper.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1LWss_gahHvpiSsp7PsZRKTEsdRttjuAq
8
+ """
9
+
10
+ import asyncio
11
+ import json
12
+ import os
13
+ import subprocess
14
+ import urllib
15
+ from datetime import datetime
16
+ from typing import Dict, List, Any, Optional
17
+ import requests
18
+ import re
19
+ from bs4 import BeautifulSoup
20
+ from gtts import gTTS
21
+ #from logger.app_logger import app_logger
22
+
23
# logger/app_logger.py
import logging

# Module-wide logger used by every class and function in this file.
app_logger = logging.getLogger(__name__)

# Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
app_logger.setLevel(logging.DEBUG)

# Console handler and the record layout it uses.
handler = logging.StreamHandler()  # Outputs logs to the console
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# logging.getLogger() hands back the same Logger object on every call, so
# re-running this code (re-executing a notebook cell, importlib.reload) would
# otherwise attach one more handler each time and print every record twice,
# three times, and so on. Attach the handler only when none is present yet.
if not app_logger.handlers:
    app_logger.addHandler(handler)

# Now you can use the logger in your other modules
# Example:
# app_logger.info("This is an informational message.")
45
+
46
# NOTE: "!pip install gTTS" is IPython/Colab shell syntax and a SyntaxError in a
# plain .py module, so it is kept here only as a comment. Install the
# dependency from a shell before importing this module:
#     pip install gTTS

import os

# Create the 'logger' directory if it doesn't exist. exist_ok=True replaces the
# separate os.path.exists() check, which could race with another process
# creating the directory in between.
os.makedirs('logger', exist_ok=True)

# Create an empty 'logger/app_logger.py' placeholder unless a file is already there.
if not os.path.exists('logger/app_logger.py'):
    with open('logger/app_logger.py', 'w') as f:
        pass  # Leave the file empty for now
60
+
61
class ChatBot:
    """
    A chatbot class that interacts with a local Llama model using Ollama.

    Each request is sent by running ``ollama run <model>`` as a subprocess and
    feeding the text on stdin. The conversation is kept in ``self.history`` as
    a list of ``{"role": ..., "content": ...}`` dictionaries.
    """

    # Model tag passed to ``ollama run`` when the caller does not choose one.
    DEFAULT_MODEL = "llama3.2:latest"

    # The only replies accepted as a valid article rating.
    _VALID_RATINGS = ("1", "2", "3", "4", "5")

    def __init__(self, model: str = DEFAULT_MODEL) -> None:
        """
        Initialize the ChatBot instance with a conversation history.

        Args:
            model (str): Ollama model tag to run. Defaults to ``llama3.2:latest``.
        """
        self.model = model
        self.history: List[Dict[str, str]] = [{"role": "system", "content": "You are a helpful assistant."}]
        # app_logger is a standard logging.Logger, so the standard level
        # methods (info/warning/error) are the ones that exist on it.
        app_logger.info("ChatBot instance initialized")

    def _run_model(self, text: str) -> "subprocess.CompletedProcess":
        """Run the configured Ollama model with ``text`` on stdin and return the finished process."""
        return subprocess.run(
            ["ollama", "run", self.model],
            input=text,
            capture_output=True,
            text=True,
        )

    @staticmethod
    def _parse_rating(raw: str) -> Optional[str]:
        """Return the rating ("1".."5") contained in a model reply, or None if the reply is not a valid rating."""
        # The prompt shows its examples in backticks and models like to add
        # markdown emphasis, so peel those decorations off before validating.
        cleaned = raw.strip().strip("`*").strip()
        return cleaned if cleaned in ChatBot._VALID_RATINGS else None

    def generate_response(self, prompt: str) -> str:
        """
        Generate a response from the chatbot based on the user's prompt.

        The prompt is appended to the history, the whole history is flattened
        into "role: content" lines and sent to the model, and a successful
        reply is appended to the history as the assistant turn.

        Args:
            prompt (str): The input message from the user.

        Returns:
            str: The chatbot's response to the provided prompt, or a fixed
            apology message if the model could not be run.
        """
        self.history.append({"role": "user", "content": prompt})
        # The prompt text itself is deliberately not logged.
        app_logger.info("User prompt added to history")

        # Convert chat history into a string for subprocess input
        conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in self.history)

        try:
            completion = self._run_model(conversation)

            if completion.returncode != 0:
                app_logger.error(f"Error running subprocess: {completion.stderr}")
                return "I'm sorry, I encountered an issue processing your request."

            response = completion.stdout.strip()
            self.history.append({"role": "assistant", "content": response})
            app_logger.info("Assistant response generated")

            return response

        except Exception as e:
            # Best-effort boundary: e.g. the ollama executable is missing.
            app_logger.error(f"Error sending query to the model: {e}")
            return "I'm sorry, an error occurred while processing your request."

    async def rate_body_of_article(self, article_title: str, article_content: str) -> str:
        """
        Rate the quality of an article's content based on its title.

        Only the first 1000 characters of the content are sent to the model.
        The blocking model call runs in the event loop's default executor so
        that several ratings awaited together can overlap.

        Args:
            article_title (str): The title of the article.
            article_content (str): The full content of the article.

        Returns:
            str: A whole-number rating "1".."5", or "Error" if the model
            failed or replied with anything that is not a valid rating.
        """
        # Limit to first 1000 chars. This used to be written as a comment
        # inside the f-string, which sent the comment text to the model.
        excerpt = (article_content or "")[:1000]

        # The example output lists whole numbers only, matching both the
        # instructions in the prompt and the validation applied to the reply.
        prompt = f"""
        Given the following article title and content, provide a rating between 1 and 5
        based on how well the content aligns with the title and its overall quality.

        - **Article Title**: {article_title}
        - **Article Content**: {excerpt}

        **Instructions:**
        - The rating should be a whole number between 1 and 5.
        - Base your score on accuracy, clarity, and relevance.
        - Only return a single numeric value (1-5) with no extra text.

        **Example Output:**
        `4` or `2` or `3` or `1`
        """

        try:
            # subprocess.run blocks; hand it to a worker thread instead of
            # stalling the event loop for the whole model run.
            loop = asyncio.get_running_loop()
            completion = await loop.run_in_executor(None, self._run_model, prompt)

            if completion.returncode != 0:
                app_logger.error(f"Error running subprocess: {completion.stderr}")
                return "Error"

            response = completion.stdout.strip()

            # Validate the rating is within the expected range
            rating = self._parse_rating(response)
            if rating is None:
                app_logger.warning(f"Invalid rating received: {response}")
                return "Error"

            self.history.append({"role": "assistant", "content": rating})
            app_logger.info(f"Article rated: {rating}")
            return rating

        except Exception as e:
            # Best-effort boundary: e.g. the ollama executable is missing.
            app_logger.error(f"Error sending query to the model: {e}")
            return "Error"
166
+
167
+
168
+ # ============================ EXTRACT NEWS BODY ============================
169
+
170
def extract_news_body(news_url: str) -> str:
    """
    Extract the full article body from a given news URL.

    The page is downloaded with a browser-like User-Agent and a 5 second
    timeout, and the stripped text of every non-empty ``<p>`` element is
    joined with newlines.

    Args:
        news_url (str): The URL of the news article.

    Returns:
        str: Extracted full article content. On failure a message string is
        returned instead of raising: "Failed to fetch article." for a
        non-200 response, or "Error extracting article content: ..." for any
        exception.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(news_url, headers=headers, timeout=5)
        if response.status_code != 200:
            # app_logger is a standard logging.Logger: use .error/.info,
            # the log_error/log_info names do not exist on it.
            app_logger.error(f"Failed to fetch article: {response.status_code}")
            return "Failed to fetch article."

        soup = BeautifulSoup(response.text, "html.parser")

        # Strip each paragraph once, then drop the ones that end up empty.
        stripped_paragraphs = (p.text.strip() for p in soup.find_all("p"))
        article_content = "\n".join(text for text in stripped_paragraphs if text)

        app_logger.info(f"Article content extracted from {news_url}")
        return article_content

    except Exception as e:
        # Best-effort boundary: callers receive the error as text, never an exception.
        app_logger.error(f"Error extracting article content: {e}")
        return f"Error extracting article content: {e}"
201
+
202
+
203
+ # ============================ ASYNC NEWS SCRAPING ============================
204
+
205
async def invoke_duckduckgo_news_search(query: str, num: int = 5, location: str = "us-en", time_filter: str = "w") -> \
        Dict[str, Any]:
    """
    Perform a DuckDuckGo News search, extract news headlines, fetch full content,
    and rate articles concurrently.

    Blocking HTTP requests are executed in the event loop's default executor
    so that the per-article coroutines gathered at the end can overlap.

    Args:
        query (str): The search query string.
        num (int): Number of search results to retrieve.
        location (str): The region code for location-based results (e.g., 'us-en', 'in-en').
        time_filter (str): Time filter for news ('d' = past day, 'w' = past week, 'm' = past month, 'y' = past year).

    Returns:
        Dict[str, Any]: ``{"status": "success", "results": [...]}`` where each
        result has the keys "num", "link", "title", "summary", "body" and
        "rating", or ``{"status": "error", "message": ...}`` when the search
        page could not be fetched or no result could be processed.
    """
    # Imported here so urllib.parse is guaranteed to be loaded: a bare
    # "import urllib" does not import the "parse" submodule by itself.
    from urllib.parse import unquote, urlencode

    # app_logger is a standard logging.Logger: use .info/.warning/.error.
    app_logger.info(f"Starting DuckDuckGo news search for query: {query}")

    # urlencode escapes every reserved character (&, #, +, ...) in the query,
    # not just spaces, so the query cannot corrupt the other parameters.
    search_params = urlencode({"q": query, "kl": location, "df": time_filter, "ia": "news"})
    duckduckgo_news_url = f"https://duckduckgo.com/html/?{search_params}"
    headers = {"User-Agent": "Mozilla/5.0"}

    loop = asyncio.get_running_loop()

    try:
        # A timeout keeps an unresponsive server from hanging the search forever.
        response = await loop.run_in_executor(
            None, lambda: requests.get(duckduckgo_news_url, headers=headers, timeout=10)
        )
    except requests.RequestException as e:
        app_logger.error(f"Failed to fetch news search results: {e}")
        return {"status": "error", "message": "Failed to fetch news search results"}

    if response.status_code != 200:
        app_logger.error(f"Failed to fetch news search results: {response.status_code}")
        return {"status": "error", "message": "Failed to fetch news search results"}

    soup = BeautifulSoup(response.text, "html.parser")
    search_results = soup.find_all("div", class_="result__body")

    async def process_article(result, index: int) -> Optional[Dict[str, Any]]:
        """Processes a single article: extracts details, fetches content, and rates it."""
        try:
            title_tag = result.find("a", class_="result__a")
            if not title_tag:
                app_logger.warning(f"Title tag not found for result index {index}")
                return None

            title = title_tag.text.strip()
            raw_link = title_tag.get("href", "")

            # Result links are redirect URLs carrying the real, percent-encoded
            # target in the "uddg" query parameter.
            match = re.search(r"uddg=(https?%3A%2F%2F[^&]+)", raw_link)
            link = unquote(match.group(1)) if match else "Unknown Link"

            snippet_tag = result.find("a", class_="result__snippet")
            summary = snippet_tag.text.strip() if snippet_tag else "No summary available."

            # extract_news_body does a blocking HTTP request; run it in a
            # worker thread so the other articles are not held up.
            article_content = await loop.run_in_executor(None, extract_news_body, link)

            bot = ChatBot()
            rating = await bot.rate_body_of_article(title, article_content)

            app_logger.info(f"Processed article: {title}")

            return {
                "num": index + 1,
                "link": link,
                "title": title,
                "summary": summary,
                "body": article_content,
                "rating": rating
            }

        except Exception as e:
            # One bad result must not abort the whole search.
            app_logger.error(f"Error processing article: {e}")
            return None

    # max(num, 0): a negative num would otherwise slice from the end of the list.
    wanted = search_results[:max(num, 0)]
    tasks = [process_article(result, index) for index, result in enumerate(wanted)]
    extracted_results = await asyncio.gather(*tasks)

    extracted_results = [res for res in extracted_results if res is not None]

    if not extracted_results:
        app_logger.error("No valid news search results found")
        return {"status": "error", "message": "No valid news search results found"}

    app_logger.info(f"News search completed successfully with {len(extracted_results)} results")
    return {"status": "success", "results": extracted_results}
281
+
282
+
283
+ # ============================ UTILITY FUNCTIONS ============================
284
+
285
def current_year() -> int:
    """Return the year component of the current local date and time."""
    right_now = datetime.now()
    return right_now.year
288
+
289
+
290
def save_to_audio(text: str, output_path: str = "output.mp3", lang: str = "en") -> None:
    """
    Converts text to an audio file using Google Text-to-Speech (gTTS).

    Failures are logged and swallowed; the function never raises.

    Args:
        text (str): The text to speak.
        output_path (str): Where to write the MP3 file. Defaults to
            "output.mp3", the path this function has always written to.
        lang (str): Language code passed to gTTS. Defaults to "en".
    """
    # Nothing to speak: skip the gTTS call rather than letting it fail.
    if not text or not text.strip():
        app_logger.warning("No text provided for audio conversion")
        return

    try:
        tts = gTTS(text=text, lang=lang)
        tts.save(output_path)
        # app_logger is a standard logging.Logger: use .info/.error, the
        # log_info/log_error names do not exist on it.
        app_logger.info("Response converted to audio")
    except Exception as e:
        # Best-effort boundary: audio output is optional for callers.
        app_logger.error(f"Error converting response to audio: {e}")
298
+