Spaces:

AliInamdar
/

SearchBot

Build error

App Files Files Community

AliInamdar committed on Mar 4, 2025

Commit

5f62365

verified ·

1 Parent(s): 27caad9

Upload helper.py

Browse files

Files changed (1) hide show

helper.py +298 -0

helper.py ADDED Viewed

	@@ -0,0 +1,298 @@

+# -*- coding: utf-8 -*-
+"""Helper.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1LWss_gahHvpiSsp7PsZRKTEsdRttjuAq
+"""
+import asyncio
+import json
+import os
+import subprocess
+import urllib
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+import requests
+import re
+from bs4 import BeautifulSoup
+from gtts import gTTS
+#from logger.app_logger import app_logger
+# logger/app_logger.py
+import logging
+# Create a logger instance
+app_logger = logging.getLogger(__name__)
+# Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
+app_logger.setLevel(logging.DEBUG)
+# Create a handler (e.g., to write logs to a file or the console)
+handler = logging.StreamHandler() # Outputs logs to the console
+# Create a formatter to specify the log message format
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+# Add the handler to the logger
+app_logger.addHandler(handler)
+# Now you can use the logger in your other modules
+# Example:
+# app_logger.info("This is an informational message.")
+!pip install gTTS
+import os
+    # Create the 'logger' directory if it doesn't exist
+if not os.path.exists('logger'):
+  os.makedirs('logger')
+    # Create an empty 'app_logger.py' file if it doesn't exist
+if not os.path.exists('logger/app_logger.py'):
+  with open('logger/app_logger.py', 'w') as f:
+    pass  # Leave the file empty for now
+class ChatBot:
+    """
+    A chatbot class that interacts with a local Llama model using Ollama.
+    """
+    def __init__(self) -> None:
+        """Initialize the ChatBot instance with a conversation history."""
+        self.history: List[Dict[str, str]] = [{"role": "system", "content": "You are a helpful assistant."}]
+        app_logger.log_info("ChatBot instance initialized", level="INFO")
+    def generate_response(self, prompt: str) -> str:
+        """
+        Generate a response from the chatbot based on the user's prompt.
+        Args:
+            prompt (str): The input message from the user.
+        Returns:
+            str: The chatbot's response to the provided prompt.
+        """
+        self.history.append({"role": "user", "content": prompt})
+        # app_logger.log_info(f"User prompt added to history: {prompt}", level="INFO")
+        app_logger.log_info("User prompt added to history", level="INFO")
+        # Convert chat history into a string for subprocess input
+        conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in self.history)
+        try:
+            # Run the Llama model using Ollama
+            completion = subprocess.run(
+                ["ollama", "run", "llama3.2:latest"],
+                input=conversation,
+                capture_output=True,
+                text=True,
+            )
+            if completion.returncode != 0:
+                app_logger.log_error(f"Error running subprocess: {completion.stderr}")
+                return "I'm sorry, I encountered an issue processing your request."
+            response = completion.stdout.strip()
+            self.history.append({"role": "assistant", "content": response})
+            # app_logger.log_info(f"Assistant response generated: {response}", level="INFO")
+            app_logger.log_info("Assistant response generated", level="INFO")
+            return response
+        except Exception as e:
+            app_logger.log_error(f"Error sending query to the model: {e}")
+            return "I'm sorry, an error occurred while processing your request."
+    async def rate_body_of_article(self, article_title: str, article_content: str) -> str:
+        """
+        Rate the quality of an article's content based on its title.
+        Args:
+            article_title (str): The title of the article.
+            article_content (str): The full content of the article.
+        Returns:
+            str: A rating between 1 and 5 based on relevance and quality.
+        """
+        prompt = f"""
+        Given the following article title and content, provide a rating between 1 and 5
+        based on how well the content aligns with the title and its overall quality.
+        - **Article Title**: {article_title}
+        - **Article Content**: {article_content[:1000]}  # Limit to first 1000 chars
+        **Instructions:**
+        - The rating should be a whole number between 1 and 5.
+        - Base your score on accuracy, clarity, and relevance.
+        - Only return a single numeric value (1-5) with no extra text.
+        **Example Output:**
+        `4` or `2` or `3.5` or `1.5`
+        """
+        try:
+            # Run the Llama model using Ollama
+            completion = subprocess.run(
+                ["ollama", "run", "llama3.2:latest"],
+                input=prompt,
+                capture_output=True,
+                text=True,
+            )
+            if completion.returncode != 0:
+                app_logger.log_error(f"Error running subprocess: {completion.stderr}")
+                return "Error"
+            response = completion.stdout.strip()
+            # Validate the rating is within the expected range
+            if response.isdigit() and 1 <= int(response) <= 5:
+                self.history.append({"role": "assistant", "content": response})
+                app_logger.log_info(f"Article rated: {response}", level="INFO")
+                return response
+            else:
+                app_logger.log_warning(f"Invalid rating received: {response}")
+                return "Error"
+        except Exception as e:
+            app_logger.log_error(f"Error sending query to the model: {e}")
+            return "Error"
+# ============================ EXTRACT NEWS BODY ============================
+def extract_news_body(news_url: str) -> str:
+    """
+    Extract the full article body from a given news URL.
+    Args:
+        news_url (str): The URL of the news article.
+    Returns:
+        str: Extracted full article content.
+    """
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+        }
+        response = requests.get(news_url, headers=headers, timeout=5)
+        if response.status_code != 200:
+            app_logger.log_error(f"Failed to fetch article: {response.status_code}")
+            return "Failed to fetch article."
+        soup = BeautifulSoup(response.text, "html.parser")
+        paragraphs = soup.find_all("p")
+        # Extract and return cleaned text
+        article_content = "\n".join([p.text.strip() for p in paragraphs if p.text.strip()])
+        app_logger.log_info(f"Article content extracted from {news_url}", level="INFO")
+        return article_content
+    except Exception as e:
+        app_logger.log_error(f"Error extracting article content: {e}")
+        return f"Error extracting article content: {e}"
+# ============================ ASYNC NEWS SCRAPING ============================
+async def invoke_duckduckgo_news_search(query: str, num: int = 5, location: str = "us-en", time_filter: str = "w") -> \
+Dict[str, Any]:
+    """
+    Perform a DuckDuckGo News search, extract news headlines, fetch full content,
+    and rate articles using parallel asynchronous processing.
+    Args:
+        query (str): The search query string.
+        num (int): Number of search results to retrieve.
+        location (str): The region code for location-based results (e.g., 'us-en', 'in-en').
+        time_filter (str): Time filter for news ('d' = past day, 'w' = past week, 'm' = past month, 'y' = past year).
+    Returns:
+        Dict[str, Any]: A dictionary containing extracted news articles.
+    """
+    app_logger.log_info(f"Starting DuckDuckGo news search for query: {query}", level="INFO")
+    duckduckgo_news_url = f"https://duckduckgo.com/html/?q={query.replace(' ', '+')}&kl={location}&df={time_filter}&ia=news"
+    headers = {"User-Agent": "Mozilla/5.0"}
+    response = requests.get(duckduckgo_news_url, headers=headers)
+    if response.status_code != 200:
+        app_logger.log_error(f"Failed to fetch news search results: {response.status_code}")
+        return {"status": "error", "message": "Failed to fetch news search results"}
+    soup = BeautifulSoup(response.text, "html.parser")
+    search_results = soup.find_all("div", class_="result__body")
+    async def process_article(result, index: int) -> Optional[Dict[str, Any]]:
+        """Processes a single article: extracts details, fetches content, and rates it."""
+        try:
+            title_tag = result.find("a", class_="result__a")
+            if not title_tag:
+                app_logger.log_warning(f"Title tag not found for result index {index}")
+                return None
+            title = title_tag.text.strip()
+            raw_link = title_tag["href"]
+            match = re.search(r"uddg=(https?%3A%2F%2F[^&]+)", raw_link)
+            link = urllib.parse.unquote(match.group(1)) if match else "Unknown Link"
+            snippet_tag = result.find("a", class_="result__snippet")
+            summary = snippet_tag.text.strip() if snippet_tag else "No summary available."
+            article_content = extract_news_body(link)
+            bot = ChatBot()
+            rating = await bot.rate_body_of_article(title, article_content)
+            app_logger.log_info(f"Processed article: {title}", level="INFO")
+            return {
+                "num": index + 1,
+                "link": link,
+                "title": title,
+                "summary": summary,
+                "body": article_content,
+                "rating": rating
+            }
+        except Exception as e:
+            app_logger.log_error(f"Error processing article: {e}")
+            return None
+    tasks = [process_article(result, index) for index, result in enumerate(search_results[:num])]
+    extracted_results = await asyncio.gather(*tasks)
+    extracted_results = [res for res in extracted_results if res is not None]
+    if extracted_results:
+        app_logger.log_info(f"News search completed successfully with {len(extracted_results)} results", level="INFO")
+        return {"status": "success", "results": extracted_results}
+    else:
+        app_logger.log_error("No valid news search results found")
+        return {"status": "error", "message": "No valid news search results found"}
+# ============================ UTILITY FUNCTIONS ============================
+def current_year() -> int:
+    """Returns the current year as an integer."""
+    return datetime.now().year
+def save_to_audio(text: str) -> None:
+    """Converts text to an audio file using Google Text-to-Speech (gTTS)."""
+    try:
+        tts = gTTS(text=text, lang="en")
+        tts.save("output.mp3")
+        app_logger.log_info("Response converted to audio", level="INFO")
+    except Exception as e:
+        app_logger.log_error(f"Error converting response to audio: {e}")