Spaces:

gabejavitt
/

agentCourse

Sleeping

App Files Community

gabejavitt committed on Nov 2, 2025

Commit

8c577e8

verified ·

1 Parent(s): a61ff20

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -62

app.py CHANGED Viewed

@@ -34,6 +34,7 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.tools import DuckDuckGoSearchRun
 # =============================================================================
 # CONFIGURATION
@@ -42,6 +43,40 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 MAX_TURNS = 20
 MAX_MESSAGE_LENGTH = 8000
 # =============================================================================
 # ASR INITIALIZATION
 # =============================================================================
@@ -343,57 +378,96 @@ class ScrapeInput(BaseModel):
 @tool(args_schema=ScrapeInput)
 def scrape_and_retrieve(url: str, query: str) -> str:
     """
-    Scrapes a webpage, chunks its content, and performs RAG search.
     """
     if not (url.lower().startswith(('http://', 'https://'))):
         return f"Error: Invalid URL. Must start with http:// or https://. Got: '{url}'"
-    if not query:
         return "Error: A query is required to search the page content."
-    # Access global agent for RAG components
-    if not hasattr(scrape_and_retrieve, 'embeddings') or not hasattr(scrape_and_retrieve, 'text_splitter'):
-        return "Error: RAG components are not initialized."
-    print(f"--- Calling RAG Scraper: {url} for query: {query} ---")
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
         response = requests.get(url, headers=headers, timeout=20)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        for tag in soup(["script", "style", "nav", "footer", "aside", "header"]):
             tag.extract()
-        main_content = soup.find('main') or soup.find('article') or soup.body
         if not main_content:
             return "Error: Could not find main content on the page."
         text = main_content.get_text(separator='\n', strip=True)
-        text = '\n'.join(chunk for chunk in (line.strip() for line in text.splitlines()) if chunk)
-        if not text:
-            return "Error: Scraped content was empty."
-        docs = scrape_and_retrieve.text_splitter.create_documents([text])
-        if not docs:
-            return "Error: Text could not be split into documents."
-        db = FAISS.from_documents(docs, scrape_and_retrieve.embeddings)
         retriever = db.as_retriever(search_kwargs={"k": 5})
         retrieved_docs = retriever.invoke(query)
         if not retrieved_docs:
-            return "Error: No relevant information found on the page for that query."
-        context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
-        return f"Relevant Context from {url} for query '{query}':\n\n{context}"
     except Exception as e:
         tb_str = traceback.format_exc()
-        return f"Error scraping or retrieving from {url}: {str(e)}\n{tb_str}"
 class FinalAnswerInput(BaseModel):
@@ -423,7 +497,7 @@ def parse_tool_call_from_string(content: str, tools: List) -> List[ToolCall]:
     """
     Parses malformed tool call strings from an LLM response.
     """
-    print(f"Original LLM content for fallback parsing:\n---\n{content}\n---")
     tool_name = None
     tool_input = None
     cleaned_str = None
@@ -513,7 +587,7 @@ class AgentState(TypedDict):
 # =============================================================================
 # CONDITIONAL EDGE FUNCTION
-#=============================================================================
 def should_continue(state: AgentState):
     """
     Decide whether to continue, call tools, or end.
@@ -562,26 +636,8 @@ class BasicAgent:
         self.tools = defined_tools
         # Initialize RAG Components
-        print("Initializing RAG components...")
-        try:
-            self.embeddings = HuggingFaceEmbeddings(
-                model_name="sentence-transformers/all-MiniLM-L6-v2",
-                model_kwargs={'device': 'cpu'}
-            )
-            self.text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=200
-            )
-            # Attach to scraper tool
-            scrape_and_retrieve.embeddings = self.embeddings
-            scrape_and_retrieve.text_splitter = self.text_splitter
-            print("✅ RAG components initialized.")
-        except Exception as e:
-            print(f"⚠️ Warning: Could not initialize RAG components. Error: {e}")
-            self.embeddings = None
-            self.text_splitter = None
         # Build tool descriptions
         tool_desc_list = []
@@ -613,35 +669,24 @@ Your goal: Provide the EXACT answer in the EXACT format requested.
 **CRITICAL RULES:**
 - **TOOL USE:** You MUST use tools to find the answer. Do NOT use your own knowledge.
 - **FINAL ANSWER:** When you have the answer, use final_answer_tool. The 'answer' argument must be the answer ONLY (e.g., "42", "red, blue, green").
-- **JSON FORMAT:** All tool calls MUST be in this exact JSON format:
-  {{"name": "tool_name", "arguments": {{"key": "value"}}}}
-**EXAMPLE: CODE INTERPRETER**
-{{"name": "code_interpreter", "arguments": {{"code": "print(1 + 1)"}}}}
-**EXAMPLE: FINAL ANSWER**
-{{"name": "final_answer_tool", "arguments": {{"answer": "28"}}}}
 **TOOLS:**
 {tool_descriptions}
-**REMEMBER:** One step at a time. Use tools. Format JSON correctly.
 """
         print("Initializing Groq LLM...")
         try:
             self.llm_with_tools = ChatGroq(
                 temperature=0,
                 groq_api_key=GROQ_API_KEY,
-                model_name="openai/gpt-oss-120b",
                 max_tokens=4096,
                 timeout=60
-            ).bind_tools(
-    self.tools,
-    # This setting forces the model to call one of the bound tools.
-    # 'auto' is the default, but 'any' is stricter for an agent.
-    tool_choice="any"
-            )
             print("✅ Main LLM (llama-3.3-70b-versatile with tools) initialized.")
         except Exception as e:
@@ -656,7 +701,7 @@ Your goal: Provide the EXACT answer in the EXACT format requested.
             print('='*60)
             if current_turn > MAX_TURNS:
-                return {"messages": [SystemMessage(content="Max turns reached.")]}
             max_retries = 3
             ai_message = None
@@ -691,7 +736,7 @@ Your goal: Provide the EXACT answer in the EXACT format requested.
         # Tool Node
         tool_node = ToolNode(self.tools)
         # Build Graph
         print("Building Single-Agent graph...")
         graph_builder = StateGraph(AgentState)

 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.tools import DuckDuckGoSearchRun
+from langchain.docstore.document import Document
 # =============================================================================
 # CONFIGURATION
 MAX_TURNS = 20
 MAX_MESSAGE_LENGTH = 8000
+# =============================================================================
+# GLOBAL RAG COMPONENTS (Initialize once)
+# =============================================================================
+global_embeddings = None
+global_text_splitter = None
+def initialize_rag_components():
+    """Initialize RAG components globally."""
+    global global_embeddings, global_text_splitter
+    if global_embeddings is None:
+        print("Initializing RAG embeddings...")
+        try:
+            global_embeddings = HuggingFaceEmbeddings(
+                model_name="sentence-transformers/all-MiniLM-L6-v2",
+                model_kwargs={'device': 'cpu'}
+            )
+            print("✅ Embeddings initialized.")
+        except Exception as e:
+            print(f"⚠️ Failed to initialize embeddings: {e}")
+            return False
+    if global_text_splitter is None:
+        print("Initializing text splitter...")
+        global_text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200,
+            length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""]
+        )
+        print("✅ Text splitter initialized.")
+    return True
 # =============================================================================
 # ASR INITIALIZATION
 # =============================================================================
 @tool(args_schema=ScrapeInput)
 def scrape_and_retrieve(url: str, query: str) -> str:
     """
+    Scrapes a webpage, embeds its content using RAG, and retrieves relevant sections based on the query.
+    Use this to extract specific information from web pages.
     """
     if not (url.lower().startswith(('http://', 'https://'))):
         return f"Error: Invalid URL. Must start with http:// or https://. Got: '{url}'"
+    if not query or not query.strip():
         return "Error: A query is required to search the page content."
+    # Check if RAG components are initialized
+    if global_embeddings is None or global_text_splitter is None:
+        if not initialize_rag_components():
+            return "Error: RAG components could not be initialized."
+    print(f"--- Calling RAG Scraper: {url} for query: '{query}' ---")
     try:
+        # Fetch the webpage
         headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
+        print(f"Fetching URL: {url}")
         response = requests.get(url, headers=headers, timeout=20)
         response.raise_for_status()
+        # Parse HTML
         soup = BeautifulSoup(response.text, 'html.parser')
+        # Remove unwanted tags
+        for tag in soup(["script", "style", "nav", "footer", "aside", "header", "iframe", "noscript"]):
             tag.extract()
+        # Try to find main content
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main|article', re.I)) or soup.body
         if not main_content:
             return "Error: Could not find main content on the page."
+        # Extract text
         text = main_content.get_text(separator='\n', strip=True)
+        # Clean up text - remove extra whitespace and empty lines
+        lines = [line.strip() for line in text.splitlines()]
+        text = '\n'.join(line for line in lines if line)
+        if not text or len(text) < 50:
+            return f"Error: Scraped content was too short or empty (length: {len(text)})."
+        print(f"Scraped text length: {len(text)} characters")
+        # Split text into chunks
+        chunks = global_text_splitter.split_text(text)
+        if not chunks:
+            return "Error: Text could not be split into chunks."
+        print(f"Created {len(chunks)} chunks")
+        # Create Document objects
+        docs = [Document(page_content=chunk, metadata={"source": url}) for chunk in chunks]
+        # Create FAISS vector store
+        print("Creating embeddings and vector store...")
+        db = FAISS.from_documents(docs, global_embeddings)
+        # Retrieve relevant chunks
+        print(f"Searching for: '{query}'")
         retriever = db.as_retriever(search_kwargs={"k": 5})
         retrieved_docs = retriever.invoke(query)
         if not retrieved_docs:
+            return f"No relevant information found on {url} for query: '{query}'\n\nThe page was successfully scraped but doesn't seem to contain information matching your query."
+        print(f"Retrieved {len(retrieved_docs)} relevant chunks")
+        # Combine retrieved chunks
+        context_parts = []
+        for i, doc in enumerate(retrieved_docs, 1):
+            context_parts.append(f"[Chunk {i}]\n{doc.page_content}")
+        context = "\n\n---\n\n".join(context_parts)
+        result = f"Successfully retrieved relevant information from {url}\n\nQuery: {query}\n\n{context}"
+        return truncate_if_needed(result)
+    except requests.RequestException as e:
+        return f"Error fetching URL {url}: {str(e)}\n\nThe website may be blocking requests or may be temporarily unavailable."
     except Exception as e:
         tb_str = traceback.format_exc()
+        return f"Error processing {url}: {str(e)}\n\nDetails:\n{tb_str}"
 class FinalAnswerInput(BaseModel):
     """
     Parses malformed tool call strings from an LLM response.
     """
+    print(f"Original LLM content for fallback parsing:\n---\n{content[:500]}\n---")
     tool_name = None
     tool_input = None
     cleaned_str = None
 # =============================================================================
 # CONDITIONAL EDGE FUNCTION
+# =============================================================================
 def should_continue(state: AgentState):
     """
     Decide whether to continue, call tools, or end.
         self.tools = defined_tools
         # Initialize RAG Components
+        if not initialize_rag_components():
+            print("⚠️ Warning: RAG components failed to initialize. scrape_and_retrieve may not work.")
         # Build tool descriptions
         tool_desc_list = []
 **CRITICAL RULES:**
 - **TOOL USE:** You MUST use tools to find the answer. Do NOT use your own knowledge.
 - **FINAL ANSWER:** When you have the answer, use final_answer_tool. The 'answer' argument must be the answer ONLY (e.g., "42", "red, blue, green").
+- **NO CONVERSATIONAL TEXT:** Never add phrases like "The answer is" or "Based on the information". Just the answer.
 **TOOLS:**
 {tool_descriptions}
+**REMEMBER:** One step at a time. Use tools. Call final_answer_tool when done.
 """
         print("Initializing Groq LLM...")
         try:
+            # Changed from tool_choice="any" to "auto" for better flexibility
             self.llm_with_tools = ChatGroq(
                 temperature=0,
                 groq_api_key=GROQ_API_KEY,
+                model_name="llama-3.3-70b-versatile",
                 max_tokens=4096,
                 timeout=60
+            ).bind_tools(self.tools, tool_choice="auto")
             print("✅ Main LLM (llama-3.3-70b-versatile with tools) initialized.")
         except Exception as e:
             print('='*60)
             if current_turn > MAX_TURNS:
+                return {"messages": [SystemMessage(content="Max turns reached.")], "turn": current_turn}
             max_retries = 3
             ai_message = None
         # Tool Node
         tool_node = ToolNode(self.tools)
         # Build Graph
         print("Building Single-Agent graph...")
         graph_builder = StateGraph(AgentState)