krim798 committed
hopefully works

Files changed:
- app.py +235 -43
- pyproject.toml +5 -0
- requirements.txt +6 -0
- uv.lock +0 -0
app.py
CHANGED
@@ -9,7 +9,193 @@ from langchain_core.tools import tool
 from dotenv import load_dotenv
 import wikipedia
 from datetime import datetime
+from smolagents import Tool
+from docling.document_converter import DocumentConverter
+from docling.chunking import HierarchicalChunker
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+
+class ContentRetrieverTool(Tool):
+    name = "retrieve_content"
+    description = """Retrieve the content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."""
+    inputs = {
+        "url": {
+            "type": "string",
+            "description": "The URL or local path of the webpage or document to retrieve.",
+        },
+        "query": {
+            "type": "string",
+            "description": "The subject on the page you are looking for. The shorter the more relevant content is returned.",
+        },
+    }
+    output_type = "string"
+
+    def __init__(
+        self,
+        model_name: str | None = None,
+        threshold: float = 0.2,
+        **kwargs,
+    ):
+        self.threshold = threshold
+        self._document_converter = DocumentConverter()
+        self._model = SentenceTransformer(
+            model_name if model_name is not None else "all-MiniLM-L6-v2"
+        )
+        self._chunker = HierarchicalChunker()
+
+        super().__init__(**kwargs)
+
+    def forward(self, url: str, query: str) -> str:
+        document = self._document_converter.convert(url).document
+
+        chunks = list(self._chunker.chunk(dl_doc=document))
+        if len(chunks) == 0:
+            return "No content found."
+
+        chunks_text = [chunk.text for chunk in chunks]
+        chunks_with_context = [self._chunker.contextualize(chunk) for chunk in chunks]
+        chunks_context = [
+            chunks_with_context[i].replace(chunks_text[i], "").strip()
+            for i in range(len(chunks))
+        ]
+
+        chunk_embeddings = self._model.encode(chunks_text, convert_to_tensor=True)
+        context_embeddings = self._model.encode(chunks_context, convert_to_tensor=True)
+        query_embedding = self._model.encode(
+            [term.strip() for term in query.split(",") if term.strip()],
+            convert_to_tensor=True,
+        )
+
+        selected_indices = []  # aggregate indexes across chunks and context matches and for all queries
+        for embeddings in [
+            context_embeddings,
+            chunk_embeddings,
+        ]:
+            # Compute cosine similarities (returns 1D tensor)
+            for cos_scores in util.pytorch_cos_sim(query_embedding, embeddings):
+                # Convert to softmax probabilities
+                probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
+                # Sort by probability descending
+                sorted_indices = torch.argsort(probabilities, descending=True)
+                # Accumulate until total probability reaches threshold
+
+                cumulative = 0.0
+                for i in sorted_indices:
+                    cumulative += probabilities[i].item()
+                    selected_indices.append(i.item())
+                    if cumulative >= self.threshold:
+                        break
+
+        selected_indices = list(
+            dict.fromkeys(selected_indices)
+        )  # remove duplicates and preserve order
+        selected_indices = selected_indices[
+            ::-1
+        ]  # make most relevant items last for better focus
+
+        if len(selected_indices) == 0:
+            return "No content found."
+
+        return "\n\n".join([chunks_with_context[idx] for idx in selected_indices])
+from smolagents import Tool
+from googleapiclient.discovery import build
+import os
 
+
+class GoogleSearchTool(Tool):
+    name = "web_search"
+    description = """Performs a google web search for query then returns top search results in markdown format."""
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform search.",
+        },
+    }
+    output_type = "string"
+
+    skip_forward_signature_validation = True
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        search_engine_id: str | None = None,
+        num_results: int = 10,
+        **kwargs,
+    ):
+        api_key = api_key if api_key is not None else os.getenv("GOOGLE_SEARCH_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "Please set the GOOGLE_SEARCH_API_KEY environment variable."
+            )
+        search_engine_id = (
+            search_engine_id
+            if search_engine_id is not None
+            else os.getenv("GOOGLE_SEARCH_ENGINE_ID")
+        )
+        if not search_engine_id:
+            raise ValueError(
+                "Please set the GOOGLE_SEARCH_ENGINE_ID environment variable."
+            )
+
+        self.cse = build("customsearch", "v1", developerKey=api_key).cse()
+        self.cx = search_engine_id
+        self.num = num_results
+        super().__init__(**kwargs)
+
+    def _collect_params(self) -> dict:
+        return {}
+
+    def forward(self, query: str, *args, **kwargs) -> str:
+        params = {
+            "q": query,
+            "cx": self.cx,
+            "fields": "items(title,link,snippet)",
+            "num": self.num,
+        }
+
+        params = params | self._collect_params(*args, **kwargs)
+
+        response = self.cse.list(**params).execute()
+        if "items" not in response:
+            return "No results found."
+
+        result = "\n\n".join(
+            [
+                f"[{item['title']}]({item['link']})\n{item['snippet']}"
+                for item in response["items"]
+            ]
+        )
+        return result
+
+
+class GoogleSiteSearchTool(GoogleSearchTool):
+    name = "site_search"
+    description = """Performs a google search within the website for query then returns top search results in markdown format."""
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform search.",
+        },
+        "site": {
+            "type": "string",
+            "description": "The domain of the site on which to search.",
+        },
+    }
+
+    def _collect_params(self, site: str) -> dict:
+        return {
+            "siteSearch": site,
+            "siteSearchFilter": "i",
+        }
+
+
+
+def get_one_word_answer(text: str) -> str:
+    # Try to extract a single word (alphanumeric) from the response
+    import re
+    words = re.findall(r'\b\w+\b', text)
+    return words[0] if words else text.strip()
 @tool
 def current_datetime(_: str = "") -> str:
     """
@@ -73,57 +259,63 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
 
 
+# ...existing code...
+
+TOOL_REGISTRY = {
+    "calculator": calculator,
+    "current_datetime": current_datetime,
+    "wikipedia_search": wikipedia_search,
+    "scraper": scraper,
+    "web_search": web_search,
+    "site_search": GoogleSiteSearchTool().forward,
+}
+
+def select_tool(question: str):
+    import re
+    # Tool selection logic (can be replaced by LLM prompt in advanced setups)
+    if any(kw in question.lower() for kw in ["calculate", "compute", "evaluate", "+", "-", "*", "/", "^", "sqrt", "log", "sum", "product"]):
+        return "calculator"
+    if any(kw in question.lower() for kw in ["date", "time", "day", "month", "year", "current time", "current date"]):
+        return "current_datetime"
+    if "wikipedia" in question.lower() or "wiki" in question.lower():
+        return "wikipedia_search"
+    # Add more rules as needed
+    return "web_search"
+
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
 
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            cleaned = question.lower().replace("wikipedia", "").replace("wiki", "").strip()
-            return wikipedia_search(cleaned if cleaned else question)
-        except Exception as e:
-            print(f"Wikipedia tool failed: {e}")
-
-        # 4. Web search + scrape logic
-        try:
-            search_result = web_search(question)
-            # Try to extract a URL from the search result for scraping
-            import re
-            url_match = re.search(r"\((https?://[^\s)]+)\)", search_result)
-            if url_match:
-                url = url_match.group(1)
-                scraped = scraper(url)
-                # Combine search snippet and scraped content for a richer answer
-                return f"{search_result}\n\nScraped content:\n{scraped}"
+        import re
+
+        tool_name = select_tool(question)
+        tool = TOOL_REGISTRY.get(tool_name, web_search)
+        # For other tools, pass the question or relevant part
+        if tool_name == "wikipedia_search":
+            cleaned = question.lower().replace("wikipedia", "").replace("wiki", "").strip()
+            return get_one_word_answer(tool(cleaned if cleaned else question))
+        if tool_name == "calculator":
+            return get_one_word_answer(tool(question))
+        if tool_name == "current_datetime":
+            return get_one_word_answer(tool())
+        if tool_name == "scraper":
+            return get_one_word_answer(tool(question))
+        if tool_name == "site_search":
+            # Example: expects "site:example.com query"
+            parts = question.split("site:")
+            if len(parts) == 2:
+                site = parts[1].split()[0]
+                query = parts[1][len(site):].strip()
+                return get_one_word_answer(tool(query, site))
             else:
-                return
-
-
-
-
+                return "No site specified."
+        # Default: web_search
+        result = tool(question)
+        return get_one_word_answer(result)
 
-
+# ...existing code...
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
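A note on the selection step in ContentRetrieverTool.forward: cosine scores against the query are softmaxed into a probability distribution, sorted descending, and chunk indices are collected until the running probability mass reaches threshold (0.2 by default), once over the context embeddings and once over the chunk texts. A minimal standalone sketch of that step, runnable with torch alone; the function name and toy scores are illustrative, not from the commit:

import torch

def select_until_threshold(cos_scores: torch.Tensor, threshold: float = 0.2) -> list[int]:
    # Turn similarity scores into a probability distribution.
    probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
    # Visit chunks from most to least similar.
    sorted_indices = torch.argsort(probabilities, descending=True)
    selected, cumulative = [], 0.0
    for i in sorted_indices:
        cumulative += probabilities[i].item()
        selected.append(i.item())
        if cumulative >= threshold:
            break
    return selected

# Toy cosine scores for five chunks. With threshold=0.2 the top chunk's
# softmax mass (about 0.28 here) already clears the cutoff, so only
# index 0 is kept; a low threshold keeps the result tightly focused.
scores = torch.tensor([0.82, 0.11, 0.40, 0.05, 0.77])
print(select_until_threshold(scores))  # [0]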
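Because TOOL_REGISTRY (in the second hunk) instantiates GoogleSiteSearchTool() at import time, the two Google environment variables must be set before app.py is imported, or the ValueError in __init__ fires immediately. A hypothetical smoke test; the placeholder values and queries are illustrative, and it assumes app.py can be imported without launching the Gradio UI:

import os

# Real credentials are needed for an actual request; these placeholders
# only keep the constructors from raising at import time.
os.environ.setdefault("GOOGLE_SEARCH_API_KEY", "<your-api-key>")
os.environ.setdefault("GOOGLE_SEARCH_ENGINE_ID", "<your-engine-id>")

from app import GoogleSearchTool, GoogleSiteSearchTool  # hypothetical import

search = GoogleSearchTool(num_results=5)
print(search.forward("agents course unit 4 scoring"))

# The subclass threads `site` through *args into _collect_params(site),
# which adds the siteSearch/siteSearchFilter parameters to the request.
site_search = GoogleSiteSearchTool(num_results=5)
print(site_search.forward("scoring api", "huggingface.co"))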
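select_tool is plain keyword matching, so the routing is easy to test in isolation. A standalone copy for experimentation (the function body is reproduced from the commit; the asserts are illustrative). Worth noting: because "-" is in the calculator keyword list, any hyphenated question will route to the calculator.

def select_tool(question: str) -> str:
    # Reproduced from app.py: first keyword match wins, web_search is the fallback.
    q = question.lower()
    if any(kw in q for kw in ["calculate", "compute", "evaluate", "+", "-", "*", "/", "^", "sqrt", "log", "sum", "product"]):
        return "calculator"
    if any(kw in q for kw in ["date", "time", "day", "month", "year", "current time", "current date"]):
        return "current_datetime"
    if "wikipedia" in q or "wiki" in q:
        return "wikipedia_search"
    return "web_search"

assert select_tool("What is 2 + 2?") == "calculator"
assert select_tool("What day is it?") == "current_datetime"
assert select_tool("Search wikipedia for Ada Lovelace") == "wikipedia_search"
assert select_tool("Who discovered penicillin?") == "web_search"
print("routing ok")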
pyproject.toml
CHANGED
@@ -5,13 +5,18 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
+    "docling>=2.39.0",
     "dotenv>=0.9.9",
     "firecrawl-py>=2.12.0",
+    "google>=3.0.0",
     "gradio>=5.35.0",
     "huggingface-hub>=0.33.1",
     "langchain>=0.3.26",
     "langchain-community>=0.3.26",
     "requests>=2.32.4",
     "ruff>=0.12.1",
+    "sentence-transformers>=4.1.0",
+    "smolagents>=1.19.0",
+    "torch>=2.7.1",
     "wikipedia>=1.4.0",
 ]
requirements.txt
CHANGED
@@ -5,3 +5,9 @@ dotenv
 langchain_community
 firecrawl_py
 wikipedia
+torch
+transformers
+sentence_transformers
+docling
+smolagents
+google
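One pin worth double-checking in both dependency files: app.py's "from googleapiclient.discovery import build" is provided by the PyPI distribution google-api-python-client, while the google package added here is a different project, so the import can still fail with only these requirements installed. A quick availability check; the module list is chosen for illustration:

# Confirm the new third-party modules actually resolve in this environment.
# `googleapiclient` ships in `google-api-python-client`, not in `google`.
import importlib.util

for module in ("googleapiclient", "docling", "smolagents", "sentence_transformers"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'ok' if found else 'MISSING'}")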
uv.lock
CHANGED
The diff for this file is too large to render; see the raw diff.