mafzaal commited on
Commit
2dad3d9
·
1 Parent(s): fbc22d4

Implement LangGraph Agent for Research with Document Retrieval and Search Tools

Browse files

- Added `agent.py` to define the Research Agent's state and processing logic.
- Implemented message handling, context retrieval from documents, and model invocation.
- Created a document search tool to query uploaded documents.
- Developed a function to convert user inputs into the agent's expected format.
- Introduced a search tools module in `search_tools.py` to integrate Tavily, DuckDuckGo, and Arxiv search functionalities.
- Established a comprehensive agent chain that includes retrieval, processing, and tool execution.

handlers/chainlit_handlers.py CHANGED
@@ -7,276 +7,16 @@ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
7
  from langchain_qdrant import QdrantVectorStore
8
  from qdrant_client import QdrantClient
9
  from qdrant_client.models import Distance, VectorParams
10
- from langchain import hub
11
- from langchain.agents import create_openai_functions_agent, AgentExecutor
12
- # Update memory import to use the newer approach
13
- from langchain_core.runnables.history import RunnableWithMessageHistory
14
- from langchain_core.chat_history import BaseChatMessageHistory
15
- from langchain_core.prompts import MessagesPlaceholder
16
 
17
  from utils.file_processor import process_file
18
  from models.rag import LangChainRAG
19
- from models.research_tools import ResearchToolkit, RAGQueryInput
 
 
20
  import config
21
- from langchain_community.tools.tavily_search import TavilySearchResults
22
- from langchain_community.tools.arxiv.tool import ArxivQueryRun
23
- from typing import TypedDict, Annotated, Dict, Any, Literal, Union, cast, List, Optional
24
- from langgraph.graph.message import add_messages
25
- import operator
26
- from langchain_core.messages import BaseMessage, SystemMessage
27
- from langgraph.graph import StateGraph, END
28
- from langchain_core.messages import HumanMessage
29
- from langchain_community.tools import DuckDuckGoSearchResults
30
- from langchain_core.documents import Document
31
- from langchain_core.tools import Tool
32
-
33
- tavily_tool = TavilySearchResults(max_results=5)
34
- duckduckgo_tool = DuckDuckGoSearchResults(max_results=5)
35
- arxiv_tool = ArxivQueryRun()
36
-
37
- tool_belt = [
38
- tavily_tool,
39
- duckduckgo_tool,
40
- arxiv_tool,
41
- ]
42
-
43
- model = ChatOpenAI(model="gpt-4o", temperature=0)
44
- model = model.bind_tools(tool_belt)
45
-
46
- class ResearchAgentState(TypedDict):
47
- """
48
- State definition for the Research Agent using LangGraph.
49
-
50
- Attributes:
51
- messages: List of messages in the conversation
52
- context: Additional context information from RAG retrievals
53
- documents: Optional list of Document objects from uploaded files
54
- """
55
- messages: Annotated[list[BaseMessage], add_messages]
56
- context: str
57
- documents: Optional[List[Document]]
58
-
59
-
60
- from langgraph.prebuilt import ToolNode
61
-
62
-
63
- def call_model(state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
64
- """
65
- Process the current state through the language model.
66
-
67
- Args:
68
- state: Current state containing messages and context
69
-
70
- Returns:
71
- Updated state with model's response added to messages
72
- """
73
- try:
74
- messages = state["messages"]
75
- context = state.get("context", "")
76
-
77
- # Add context from documents if available
78
- if context:
79
- # Insert system message with context before the latest user message
80
- context_message = SystemMessage(content=f"Use the following information from uploaded documents to enhance your response if relevant:\n\n{context}")
81
-
82
- # Find the position of the last user message
83
- for i in range(len(messages)-1, -1, -1):
84
- if isinstance(messages[i], HumanMessage):
85
- # Insert context right after the last user message
86
- enhanced_messages = messages[:i+1] + [context_message] + messages[i+1:]
87
- break
88
- else:
89
- # No user message found, just append context
90
- enhanced_messages = messages + [context_message]
91
- else:
92
- enhanced_messages = messages
93
-
94
- # Get response from the model
95
- response = model.invoke(enhanced_messages)
96
- return {"messages": [response]}
97
- except Exception as e:
98
- # Handle exceptions gracefully
99
- error_msg = f"Error calling model: {str(e)}"
100
- print(error_msg) # Log the error
101
- # Return a fallback response
102
- return {"messages": [HumanMessage(content=error_msg)]}
103
-
104
-
105
- def should_continue(state: Dict[str, Any]) -> Union[Literal["action"], Literal[END]]:
106
- """
107
- Determine if the agent should continue processing or end.
108
-
109
- Args:
110
- state: Current state containing messages and context
111
-
112
- Returns:
113
- "action" if tool calls are present, otherwise END
114
- """
115
- last_message = state["messages"][-1]
116
-
117
- if last_message.tool_calls:
118
- return "action"
119
-
120
- return END
121
-
122
-
123
- def convert_inputs(input_object: Dict[str, str]) -> Dict[str, list[BaseMessage]]:
124
- """
125
- Convert user input into the format expected by the agent.
126
-
127
- Args:
128
- input_object: Dictionary containing the user's question
129
-
130
- Returns:
131
- Formatted input state for the agent
132
- """
133
- return {"messages": [HumanMessage(content=input_object["question"])]}
134
-
135
-
136
- def parse_output(input_state: Dict[str, Any]) -> str:
137
- """
138
- Extract the final response from the agent's state.
139
-
140
- Args:
141
- input_state: The final state of the agent
142
-
143
- Returns:
144
- The content of the last message
145
- """
146
- try:
147
- return cast(str, input_state["messages"][-1].content)
148
- except (IndexError, KeyError, AttributeError) as e:
149
- # Handle potential errors when accessing the output
150
- error_msg = f"Error parsing output: {str(e)}"
151
- print(error_msg) # Log the error
152
- return "I encountered an error while processing your request."
153
-
154
-
155
- def build_agent_chain() -> Any:
156
- """
157
- Constructs and returns the research agent execution chain.
158
-
159
- The chain consists of:
160
- 1. A retrieval node that gets context from documents
161
- 2. An agent node that processes messages
162
- 3. A tool node that executes tools when called
163
-
164
- Returns:
165
- Compiled agent chain ready for execution
166
- """
167
- # Create document search tool
168
- doc_search_tool = Tool(
169
- name="DocumentSearch",
170
- description="Search within the user's uploaded document. Use this tool when you need information from the specific document that was uploaded.",
171
- func=document_search_tool,
172
- args_schema=RAGQueryInput
173
- )
174
-
175
- # Add document search tool to the tool belt if we have upload capability
176
- tools = tool_belt.copy()
177
- tools.append(doc_search_tool)
178
-
179
- # Create a node for tool execution
180
- tool_node = ToolNode(tools)
181
-
182
- # Initialize the graph with our state type
183
- uncompiled_graph = StateGraph(ResearchAgentState)
184
-
185
- # Add nodes
186
- uncompiled_graph.add_node("retrieve", retrieve_from_documents)
187
- uncompiled_graph.add_node("agent", call_model)
188
- uncompiled_graph.add_node("action", tool_node)
189
-
190
- # Set the entry point to retrieve context first
191
- uncompiled_graph.set_entry_point("retrieve")
192
-
193
- # Add edges
194
- uncompiled_graph.add_edge("retrieve", "agent")
195
-
196
- # Add conditional edges from agent
197
- uncompiled_graph.add_conditional_edges(
198
- "agent",
199
- should_continue,
200
- {
201
- "action": "action",
202
- END: END
203
- }
204
- )
205
-
206
- # Complete the loop
207
- uncompiled_graph.add_edge("action", "agent")
208
-
209
- # Compile the graph
210
- compiled_graph = uncompiled_graph.compile()
211
-
212
- # Create the full chain
213
- agent_chain = convert_inputs | compiled_graph
214
- return agent_chain
215
-
216
-
217
- def retrieve_from_documents(state: Dict[str, Any]) -> Dict[str, str]:
218
- """
219
- Retrieve relevant context from uploaded documents based on the user query.
220
-
221
- Args:
222
- state: Current state containing messages and optional documents
223
-
224
- Returns:
225
- Updated state with context from document retrieval
226
- """
227
- # Get the last user message
228
- for message in reversed(state["messages"]):
229
- if isinstance(message, HumanMessage):
230
- query = message.content
231
- break
232
- else:
233
- # No user message found
234
- return {"context": ""}
235
-
236
- # Skip if no documents are uploaded
237
- retriever = cl.user_session.get("retriever")
238
- if not retriever:
239
- return {"context": ""}
240
-
241
- try:
242
- # Retrieve relevant documents
243
- docs = retriever.invoke(query)
244
- if not docs:
245
- return {"context": ""}
246
-
247
- # Extract text from documents
248
- context = "\n\n".join([f"Document excerpt: {doc.page_content}" for doc in docs])
249
- return {"context": context}
250
- except Exception as e:
251
- print(f"Error retrieving from documents: {str(e)}")
252
- return {"context": ""}
253
-
254
-
255
- def document_search_tool(query: str) -> str:
256
- """
257
- Tool function to search within uploaded documents.
258
-
259
- Args:
260
- query: Search query string
261
-
262
- Returns:
263
- Information retrieved from the documents
264
- """
265
- retriever = cl.user_session.get("retriever")
266
- if not retriever:
267
- return "No documents have been uploaded yet. Please upload a document first."
268
-
269
- docs = retriever.invoke(query)
270
- if not docs:
271
- return "No relevant information found in the uploaded documents."
272
-
273
- # Format the results
274
- results = []
275
- for i, doc in enumerate(docs):
276
- results.append(f"[Document {i+1}] {doc.page_content}")
277
-
278
- return "\n\n".join(results)
279
-
280
 
281
  @cl.on_chat_start
282
  async def on_chat_start():
@@ -289,8 +29,11 @@ async def on_chat_start():
289
  content="Welcome to the Research Agent! I can help you research topics using web search, arXiv papers, and documents you upload."
290
  ).send()
291
 
 
 
 
292
  # Create the agent
293
- agent = build_agent_chain()
294
 
295
  # Store agent in user session
296
  cl.user_session.set("agent", agent)
@@ -328,7 +71,7 @@ async def main(message):
328
  with cl.Step(name="Research Process", type="tool") as step:
329
  # Run the agent executor with callbacks to stream the response
330
  result = await agent_executor.ainvoke(
331
- {"question" : message.content},
332
  config={
333
  "callbacks": [cl.AsyncLangchainCallbackHandler()],
334
  "configurable": {"session_id": message.id} # Add session_id from message
@@ -348,10 +91,9 @@ async def main(message):
348
  ).send()
349
 
350
  # Get the final answer
351
- final_answer = parse_output(result) #result["messages"][-1].content
352
 
353
- # Fix: Replace cl.make_async_gen with proper token streaming in Chainlit 2.0.4
354
- # Instead of using make_async_gen, we'll manually stream tokens from the final_answer
355
  await msg.stream_token(final_answer)
356
  await msg.send()
357
 
@@ -407,8 +149,11 @@ async def process_uploaded_file(file: cl.File, msg: cl.Message):
407
  # Store the retriever in the user session
408
  cl.user_session.set("retriever", retriever)
409
 
410
- # Rebuild the agent chain with updated tools
411
- agent = build_agent_chain()
 
 
 
412
  cl.user_session.set("agent", agent)
413
 
414
  # Let the user know that the file is processed
 
7
  from langchain_qdrant import QdrantVectorStore
8
  from qdrant_client import QdrantClient
9
  from qdrant_client.models import Distance, VectorParams
10
+ from langchain_core.tools import Tool
11
+ from typing import Dict, Any, List, Optional
12
+ from langchain_core.documents import Document
 
 
 
13
 
14
  from utils.file_processor import process_file
15
  from models.rag import LangChainRAG
16
+ from models.research_tools import RAGQueryInput
17
+ from models.search_tools import create_search_tools
18
+ from models.agent import build_agent_chain, parse_output
19
  import config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @cl.on_chat_start
22
  async def on_chat_start():
 
29
  content="Welcome to the Research Agent! I can help you research topics using web search, arXiv papers, and documents you upload."
30
  ).send()
31
 
32
+ # Create search tools
33
+ tools = create_search_tools(max_results=config.MAX_TAVILY_SEARCH_RESULTS)
34
+
35
  # Create the agent
36
+ agent = build_agent_chain(tools)
37
 
38
  # Store agent in user session
39
  cl.user_session.set("agent", agent)
 
71
  with cl.Step(name="Research Process", type="tool") as step:
72
  # Run the agent executor with callbacks to stream the response
73
  result = await agent_executor.ainvoke(
74
+ {"question": message.content},
75
  config={
76
  "callbacks": [cl.AsyncLangchainCallbackHandler()],
77
  "configurable": {"session_id": message.id} # Add session_id from message
 
91
  ).send()
92
 
93
  # Get the final answer
94
+ final_answer = parse_output(result)
95
 
96
+ # Stream tokens from the final_answer
 
97
  await msg.stream_token(final_answer)
98
  await msg.send()
99
 
 
149
  # Store the retriever in the user session
150
  cl.user_session.set("retriever", retriever)
151
 
152
+ # Get the search tools
153
+ tools = create_search_tools(max_results=config.MAX_TAVILY_SEARCH_RESULTS)
154
+
155
+ # Rebuild the agent with the retriever
156
+ agent = build_agent_chain(tools, retriever)
157
  cl.user_session.set("agent", agent)
158
 
159
  # Let the user know that the file is processed
models/agent.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Agent implementation for the Research Agent.
3
+ """
4
+ from typing import TypedDict, Annotated, Dict, Any, Literal, Union, cast, List, Optional
5
+
6
+ from langchain_openai import ChatOpenAI
7
+ from langchain_core.tools import Tool
8
+ from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage
9
+ from langchain_core.documents import Document
10
+
11
+ from langgraph.graph.message import add_messages
12
+ from langgraph.graph import StateGraph, END
13
+ from langgraph.prebuilt import ToolNode
14
+ from models.research_tools import RAGQueryInput
15
+
16
+ # Define END as a string constant since we can't use it directly in type annotations
17
+ END_STATE = "end"
18
+
19
class ResearchAgentState(TypedDict):
    """
    State definition for the Research Agent using LangGraph.

    Attributes:
        messages: List of messages in the conversation. The `add_messages`
            annotation is a LangGraph reducer: node outputs are appended to
            the history rather than overwriting it.
        context: Additional context information from RAG retrievals
            (empty string when no documents have been uploaded).
        documents: Optional list of Document objects from uploaded files.
    """
    # Reducer-annotated channel: each node's returned messages are merged in.
    messages: Annotated[list[BaseMessage], add_messages]
    # Text blob of retrieved document excerpts, filled by the retrieve node.
    context: str
    # Raw uploaded documents, if any were provided.
    documents: Optional[List[Document]]
31
+
32
+
33
def call_model(model, state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
    """
    Process the current state through the language model.

    Args:
        model: Language model with tools bound
        state: Current state containing messages and context

    Returns:
        Updated state with model's response added to messages
    """
    try:
        history = state["messages"]
        doc_context = state.get("context", "")

        if doc_context:
            # Surface retrieved document excerpts as a system message placed
            # right after the most recent user turn.
            note = SystemMessage(content=f"Use the following information from uploaded documents to enhance your response if relevant:\n\n{doc_context}")

            # Locate the last human message, scanning from the end.
            last_user_idx = None
            for idx in range(len(history) - 1, -1, -1):
                if isinstance(history[idx], HumanMessage):
                    last_user_idx = idx
                    break

            if last_user_idx is None:
                # No user message found — append the context at the end.
                prompt = history + [note]
            else:
                prompt = history[:last_user_idx + 1] + [note] + history[last_user_idx + 1:]
        else:
            prompt = history

        # Invoke the model on the (possibly context-augmented) transcript.
        reply = model.invoke(prompt)
        return {"messages": [reply]}
    except Exception as exc:
        # Degrade gracefully: log and return the error text as a message.
        error_msg = f"Error calling model: {str(exc)}"
        print(error_msg)  # Log the error
        return {"messages": [HumanMessage(content=error_msg)]}
74
+
75
+
76
def should_continue(state: Dict[str, Any]) -> Union[Literal["action"], Literal["end"]]:
    """
    Determine if the agent should continue processing or end.

    Args:
        state: Current state containing messages and context

    Returns:
        "action" if the last message carries tool calls, otherwise "end"
    """
    last_message = state["messages"][-1]

    # Use getattr: only AI messages expose `tool_calls`. The error fallback
    # in call_model returns a HumanMessage, which has no such attribute and
    # would previously raise AttributeError here.
    if getattr(last_message, "tool_calls", None):
        return "action"

    return "end"
92
+
93
+
94
def retrieve_from_documents(state: Dict[str, Any], retriever) -> Dict[str, str]:
    """
    Retrieve relevant context from uploaded documents based on the user query.

    Args:
        state: Current state containing messages and optional documents
        retriever: Document retriever to use

    Returns:
        Updated state with context from document retrieval
    """
    # Use the most recent human turn as the retrieval query.
    found = False
    query = None
    for msg in reversed(state["messages"]):
        if isinstance(msg, HumanMessage):
            query = msg.content
            found = True
            break

    if not found:
        # No user message found — nothing to search for.
        return {"context": ""}

    # Skip retrieval entirely when no documents have been uploaded.
    if not retriever:
        return {"context": ""}

    try:
        docs = retriever.invoke(query)
        if not docs:
            return {"context": ""}

        # Join excerpts into one text blob for the system-message injection.
        excerpts = [f"Document excerpt: {doc.page_content}" for doc in docs]
        return {"context": "\n\n".join(excerpts)}
    except Exception as exc:
        # Best-effort: retrieval failures degrade to "no context".
        print(f"Error retrieving from documents: {str(exc)}")
        return {"context": ""}
130
+
131
+
132
def document_search_tool(retriever, query: str) -> str:
    """
    Tool function to search within uploaded documents.

    Args:
        retriever: Document retriever to use
        query: Search query string

    Returns:
        Information retrieved from the documents
    """
    if not retriever:
        return "No documents have been uploaded yet. Please upload a document first."

    matches = retriever.invoke(query)
    if not matches:
        return "No relevant information found in the uploaded documents."

    # Number each excerpt so the model can cite individual passages.
    return "\n\n".join(
        f"[Document {idx + 1}] {doc.page_content}" for idx, doc in enumerate(matches)
    )
156
+
157
+
158
def convert_inputs(input_object: Dict[str, str]) -> Dict[str, list[BaseMessage]]:
    """
    Convert user input into the format expected by the agent.

    Args:
        input_object: Dictionary containing the user's question

    Returns:
        Formatted input state for the agent
    """
    # Wrap the raw question as the opening human turn of the conversation.
    question = input_object["question"]
    return {"messages": [HumanMessage(content=question)]}
169
+
170
+
171
def parse_output(input_state: Dict[str, Any]) -> str:
    """
    Extract the final response from the agent's state.

    Args:
        input_state: The final state of the agent

    Returns:
        The content of the last message
    """
    try:
        final_message = input_state["messages"][-1]
        return cast(str, final_message.content)
    except (IndexError, KeyError, AttributeError) as exc:
        # Fall back to a generic apology rather than propagating the error.
        error_msg = f"Error parsing output: {str(exc)}"
        print(error_msg)  # Log the error
        return "I encountered an error while processing your request."
188
+
189
+
190
def build_agent_chain(tools, retriever=None):
    """
    Constructs and returns the research agent execution chain.

    The chain consists of:
    1. An optional retrieval node that gets context from documents
    2. An agent node that processes messages
    3. A tool node that executes tools when called

    Args:
        tools: List of tools for the agent
        retriever: Optional retriever for document search

    Returns:
        Compiled agent chain ready for execution
    """
    # Build the complete tool list FIRST. Previously the model was bound to
    # `tools` before DocumentSearch was appended, so the model could never
    # call the document-search tool. Copy to avoid mutating the caller's list.
    tools = list(tools)
    if retriever:
        doc_search_tool = Tool(
            name="DocumentSearch",
            description="Search within the user's uploaded document. Use this tool when you need information from the specific document that was uploaded.",
            func=lambda query: document_search_tool(retriever, query),
            args_schema=RAGQueryInput
        )
        tools.append(doc_search_tool)

    # Create the model and bind the full tool set so it can emit tool calls.
    model = ChatOpenAI(model="gpt-4o", temperature=0)
    model = model.bind_tools(tools)

    # Create a node for tool execution
    tool_node = ToolNode(tools)

    # Initialize the graph with our state type
    uncompiled_graph = StateGraph(ResearchAgentState)

    # Close over the bound model so the node matches LangGraph's
    # single-argument node signature.
    def call_model_node(state):
        return call_model(model, state)

    # Add all nodes before wiring edges that reference them.
    uncompiled_graph.add_node("agent", call_model_node)
    uncompiled_graph.add_node("action", tool_node)

    if retriever:
        # Close over the retriever for the same reason as the model above.
        def retrieve_node(state):
            return retrieve_from_documents(state, retriever)

        uncompiled_graph.add_node("retrieve", retrieve_node)
        uncompiled_graph.set_entry_point("retrieve")
        uncompiled_graph.add_edge("retrieve", "agent")
    else:
        uncompiled_graph.set_entry_point("agent")

    # Route on should_continue: run tools when the model requested them,
    # otherwise finish. "end" maps straight to the END sentinel — the
    # previous explicit add_node("end", ...) was unreachable dead code.
    uncompiled_graph.add_conditional_edges(
        "agent",
        should_continue,
        {
            "action": "action",
            "end": END
        }
    )

    # Complete the loop: tool results flow back to the model.
    uncompiled_graph.add_edge("action", "agent")

    # Compile the graph
    compiled_graph = uncompiled_graph.compile()

    # Prepend input conversion so callers can invoke with {"question": ...}.
    agent_chain = convert_inputs | compiled_graph
    return agent_chain
models/research_tools.py CHANGED
@@ -1,148 +1,28 @@
1
  """
2
  Research tools implementation for the agent.
3
 
4
- This module implements the Tavily search, arXiv, and RAG tools
5
- that will be used by the research agent.
6
  """
7
- import os
8
- from typing import List, Dict, Any, Optional
9
- from langchain.agents import tool
10
- from langchain_core.tools import Tool
11
- from pydantic import BaseModel, Field # Updated import from pydantic directly
12
-
13
- from langchain_openai import ChatOpenAI
14
- from langchain_community.tools.tavily_search import TavilySearchResults
15
- from langchain_community.utilities.arxiv import ArxivAPIWrapper
16
 
17
- import config
18
- from models.rag import LangChainRAG
19
 
20
  class ArxivQueryInput(BaseModel):
21
  """Input for arXiv query."""
22
  query: str = Field(..., description="The search query to find papers on arXiv")
23
- max_results: int = Field(default=config.MAX_ARXIV_SEARCH_RESULTS, description="The maximum number of results to return")
24
 
25
  class RAGQueryInput(BaseModel):
26
  """Input for RAG query."""
27
  query: str = Field(..., description="The query to search in the uploaded document")
28
 
29
- def create_tavily_search_tool() -> Tool:
30
- """Create a Tavily search tool for the agent."""
31
- # Check if TAVILY_API_KEY is in environment variables
32
- if "TAVILY_API_KEY" not in os.environ:
33
- print("Warning: TAVILY_API_KEY environment variable not set. Web search functionality may be limited.")
34
-
35
- return TavilySearchResults(max_results=config.MAX_TAVILY_SEARCH_RESULTS)
36
-
37
- @tool
38
- def arxiv_search(query: str, max_results: int = config.MAX_ARXIV_SEARCH_RESULTS) -> str:
39
- """
40
- Search for papers on arXiv.
41
-
42
- Args:
43
- query: The search query string
44
- max_results: Maximum number of results to return
45
-
46
- Returns:
47
- A string summary of the search results
48
- """
49
- client = ArxivAPIWrapper(
50
- top_k_results=max_results,
51
- ARXIV_MAX_QUERY_LENGTH=300,
52
- load_max_docs=max_results,
53
- load_all_available_meta=True
54
- )
55
-
56
- try:
57
- results = client.run(query)
58
- if not results:
59
- return "No papers found on arXiv for this query."
60
-
61
- formatted_results = []
62
- for idx, result in enumerate(results.split("\n\n")):
63
- if result.strip():
64
- formatted_results.append(f"[{idx+1}] {result.strip()}")
65
-
66
- return "\n\n".join(formatted_results)
67
- except Exception as e:
68
- return f"Error searching arXiv: {str(e)}"
69
 
70
- class ResearchToolkit:
71
- """
72
- A toolkit of research tools for the agent.
73
- """
74
- def __init__(self, rag_chain: Optional[LangChainRAG] = None):
75
- """
76
- Initialize the research toolkit.
77
-
78
- Args:
79
- rag_chain: Optional RAG chain instance
80
- """
81
- self.rag_chain = rag_chain
82
- self.tools = self._create_tools()
83
-
84
- def _create_tools(self) -> List[Tool]:
85
- """
86
- Create the tools for the agent.
87
-
88
- Returns:
89
- List of tools
90
- """
91
- tools = [
92
- create_tavily_search_tool(),
93
- Tool(
94
- name="ArxivSearch",
95
- description="Search for scientific papers on arXiv. Use this tool when you need academic or scientific information.",
96
- func=arxiv_search,
97
- args_schema=ArxivQueryInput
98
- )
99
- ]
100
-
101
- # Add RAG tool if available
102
- if self.rag_chain:
103
- @tool
104
- def document_rag_search(query: str) -> str:
105
- """
106
- Search the uploaded document using RAG.
107
-
108
- Args:
109
- query: The search query string
110
-
111
- Returns:
112
- The response from the RAG model
113
- """
114
- docs = self.rag_chain.retriever.invoke(query)
115
- context = "\n\n".join([doc.page_content for doc in docs])
116
- response = self.rag_chain.chain.invoke(query)
117
-
118
- return f"Based on the uploaded document: {response}"
119
-
120
- tools.append(
121
- Tool(
122
- name="DocumentSearch",
123
- description="Search within the user's uploaded document. Use this tool when you need information from the specific document that was uploaded.",
124
- func=document_rag_search,
125
- args_schema=RAGQueryInput
126
- )
127
- )
128
-
129
- return tools
130
-
131
- def get_tools(self) -> List[Tool]:
132
- """
133
- Get the list of tools.
134
-
135
- Returns:
136
- List of tools
137
- """
138
- return self.tools
139
-
140
- def set_rag_chain(self, rag_chain: LangChainRAG):
141
- """
142
- Update the RAG chain and rebuild tools.
143
-
144
- Args:
145
- rag_chain: New RAG chain instance
146
- """
147
- self.rag_chain = rag_chain
148
- self.tools = self._create_tools()
 
1
  """
2
  Research tools implementation for the agent.
3
 
4
+ This module implements input schemas and tools specifically for research purposes.
 
5
  """
6
+ from typing import List, Optional
7
+ from pydantic import BaseModel, Field
 
 
 
 
 
 
 
8
 
9
+ from langchain_core.tools import Tool
 
10
 
11
class ArxivQueryInput(BaseModel):
    """Input schema for the arXiv search tool."""
    # Free-text query forwarded to the arXiv API.
    query: str = Field(..., description="The search query to find papers on arXiv")
    # Caps how many papers the tool returns.
    max_results: int = Field(default=5, description="The maximum number of results to return")
15
 
16
class RAGQueryInput(BaseModel):
    """Input schema for querying the uploaded document via RAG."""
    # Question to run against the uploaded document's retriever.
    query: str = Field(..., description="The query to search in the uploaded document")
19
 
20
class WebSearchInput(BaseModel):
    """Input schema for the web search tools."""
    # Free-text web search query.
    query: str = Field(..., description="The search query for web search")
    # Caps how many search hits are returned.
    max_results: int = Field(default=5, description="The maximum number of results to return")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
class DocumentAnalysisInput(BaseModel):
    """Input schema for focused analysis of an uploaded document."""
    # Specific question the analysis should answer.
    query: str = Field(..., description="The specific question to analyze in the document")
    # When True, the answer should cite document passages.
    include_citations: bool = Field(default=True, description="Whether to include citations in the response")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/search_tools.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Search tools module containing different search implementations.
3
+ """
4
+ from langchain_community.tools.tavily_search import TavilySearchResults
5
+ from langchain_community.tools.arxiv.tool import ArxivQueryRun
6
+ from langchain_community.tools import DuckDuckGoSearchResults
7
+ from langchain_core.tools import Tool
8
+
9
def create_search_tools(max_results=5):
    """
    Create search tools for the research agent.

    Args:
        max_results: Maximum number of results to return

    Returns:
        List of search tools for the agent
    """
    # Two web-search providers (Tavily, DuckDuckGo) plus academic search
    # via arXiv; arXiv manages its own result limits.
    return [
        TavilySearchResults(max_results=max_results),
        DuckDuckGoSearchResults(max_results=max_results),
        ArxivQueryRun(),
    ]
pyproject.toml CHANGED
@@ -8,6 +8,7 @@ dependencies = [
8
  "arxiv>=2.2.0",
9
  "chainlit==2.0.4",
10
  "duckduckgo-search>=8.0.1",
 
11
  "langchain>=0.3.23",
12
  "langchain-community>=0.3.21",
13
  "langchain-core>=0.3.54",
@@ -16,6 +17,8 @@ dependencies = [
16
  "langchain-qdrant>=0.2.0",
17
  "langchain-text-splitters>=0.3.8",
18
  "langgraph>=0.3.31",
 
 
19
  "numpy==2.2.2",
20
  "openai==1.59.9",
21
  "pydantic==2.10.1",
 
8
  "arxiv>=2.2.0",
9
  "chainlit==2.0.4",
10
  "duckduckgo-search>=8.0.1",
11
+ "feedparser>=6.0.11",
12
  "langchain>=0.3.23",
13
  "langchain-community>=0.3.21",
14
  "langchain-core>=0.3.54",
 
17
  "langchain-qdrant>=0.2.0",
18
  "langchain-text-splitters>=0.3.8",
19
  "langgraph>=0.3.31",
20
+ "listparser>=0.20",
21
+ "newspaper3k>=0.2.8",
22
  "numpy==2.2.2",
23
  "openai==1.59.9",
24
  "pydantic==2.10.1",
utils/file_processor.py CHANGED
@@ -4,24 +4,65 @@ Utilities for processing uploaded files.
4
  import os
5
  import tempfile
6
  import shutil
7
- from typing import List
 
8
 
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- from langchain_community.document_loaders import PyPDFLoader, TextLoader
 
 
 
 
 
 
 
11
  from chainlit.types import AskFileResponse
12
 
13
  import config
14
 
15
- # Initialize text splitter
16
- text_splitter = RecursiveCharacterTextSplitter(
17
- chunk_size=config.CHUNK_SIZE,
18
- chunk_overlap=config.CHUNK_OVERLAP,
19
- length_function=len,
20
- is_separator_regex=False,
21
- separators=config.SEPARATORS
22
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- def process_file(file: AskFileResponse):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  """
26
  Process an uploaded file and split it into text chunks.
27
 
@@ -29,28 +70,34 @@ def process_file(file: AskFileResponse):
29
  file: The uploaded file response from Chainlit
30
 
31
  Returns:
32
- List of document chunks
33
  """
34
  print(f"Processing file: {file.name}")
35
 
36
  # Create a temporary file with the correct extension
37
  suffix = f".{file.name.split('.')[-1]}"
38
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
39
- # Copy the uploaded file content to the temporary file
40
- shutil.copyfile(file.path, temp_file.name)
41
- print(f"Created temporary file at: {temp_file.name}")
42
-
43
  try:
44
- # Create appropriate loader
45
- if file.name.lower().endswith('.pdf'):
46
- loader = PyPDFLoader(temp_file.name)
47
- else:
48
- loader = TextLoader(temp_file.name)
 
49
 
50
- # Load and process the documents
51
  documents = loader.load()
 
 
 
 
 
52
  texts = text_splitter.split_documents(documents)
 
53
  return texts
 
 
 
54
  finally:
55
  # Clean up the temporary file
56
  try:
 
4
  import os
5
  import tempfile
6
  import shutil
7
+ from typing import List, Optional
8
+ from pathlib import Path
9
 
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+ from langchain_community.document_loaders import (
12
+ PyPDFLoader,
13
+ TextLoader,
14
+ CSVLoader,
15
+ UnstructuredExcelLoader,
16
+ Docx2txtLoader
17
+ )
18
+ from langchain_core.documents import Document
19
  from chainlit.types import AskFileResponse
20
 
21
  import config
22
 
23
def get_document_loader(file_path: str):
    """
    Get the appropriate document loader based on file extension.

    Args:
        file_path: Path to the file on disk.

    Returns:
        Document loader instance for the file type. Unrecognized
        extensions fall back to TextLoader, matching the original
        default behavior.
    """
    # Dispatch table instead of a chained ==/elif ladder: one place to
    # see (and extend) the supported extensions.
    loader_by_extension = {
        '.pdf': PyPDFLoader,
        '.txt': TextLoader,
        '.md': TextLoader,
        '.py': TextLoader,
        '.csv': CSVLoader,
        '.xlsx': UnstructuredExcelLoader,
        '.xls': UnstructuredExcelLoader,
        '.docx': Docx2txtLoader,
        '.doc': Docx2txtLoader,
    }

    # suffix includes the leading dot; lower() makes matching case-insensitive.
    file_extension = Path(file_path).suffix.lower()

    # Default to TextLoader for anything we do not explicitly recognize.
    loader_cls = loader_by_extension.get(file_extension, TextLoader)
    return loader_cls(file_path)
49
 
50
def create_text_splitter():
    """
    Create a text splitter using the project-wide configuration.

    Returns:
        A RecursiveCharacterTextSplitter configured with the chunk size,
        overlap, and separators defined in the config module.
    """
    # Collect the configured options in one place, then unpack them into
    # the splitter constructor.
    splitter_options = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
        "separators": config.SEPARATORS,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)
64
+
65
+ def process_file(file: AskFileResponse) -> Optional[List[Document]]:
66
  """
67
  Process an uploaded file and split it into text chunks.
68
 
 
70
  file: The uploaded file response from Chainlit
71
 
72
  Returns:
73
+ List of document chunks or None if processing fails
74
  """
75
  print(f"Processing file: {file.name}")
76
 
77
  # Create a temporary file with the correct extension
78
  suffix = f".{file.name.split('.')[-1]}"
79
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
 
 
 
 
80
  try:
81
+ # Copy the uploaded file content to the temporary file
82
+ shutil.copyfile(file.path, temp_file.name)
83
+ print(f"Created temporary file at: {temp_file.name}")
84
+
85
+ # Get the appropriate loader
86
+ loader = get_document_loader(temp_file.name)
87
 
88
+ # Load documents
89
  documents = loader.load()
90
+
91
+ # Initialize text splitter
92
+ text_splitter = create_text_splitter()
93
+
94
+ # Split documents into chunks
95
  texts = text_splitter.split_documents(documents)
96
+
97
  return texts
98
+ except Exception as e:
99
+ print(f"Error processing file: {e}")
100
+ return None
101
  finally:
102
  # Clean up the temporary file
103
  try:
uv.lock CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  { name = "arxiv" },
11
  { name = "chainlit" },
12
  { name = "duckduckgo-search" },
 
13
  { name = "langchain" },
14
  { name = "langchain-community" },
15
  { name = "langchain-core" },
@@ -18,6 +19,8 @@ dependencies = [
18
  { name = "langchain-qdrant" },
19
  { name = "langchain-text-splitters" },
20
  { name = "langgraph" },
 
 
21
  { name = "numpy" },
22
  { name = "openai" },
23
  { name = "pydantic" },
@@ -33,6 +36,7 @@ requires-dist = [
33
  { name = "arxiv", specifier = ">=2.2.0" },
34
  { name = "chainlit", specifier = "==2.0.4" },
35
  { name = "duckduckgo-search", specifier = ">=8.0.1" },
 
36
  { name = "langchain", specifier = ">=0.3.23" },
37
  { name = "langchain-community", specifier = ">=0.3.21" },
38
  { name = "langchain-core", specifier = ">=0.3.54" },
@@ -41,6 +45,8 @@ requires-dist = [
41
  { name = "langchain-qdrant", specifier = ">=0.2.0" },
42
  { name = "langchain-text-splitters", specifier = ">=0.3.8" },
43
  { name = "langgraph", specifier = ">=0.3.31" },
 
 
44
  { name = "numpy", specifier = "==2.2.2" },
45
  { name = "openai", specifier = "==1.59.9" },
46
  { name = "pydantic", specifier = "==2.10.1" },
@@ -170,6 +176,19 @@ wheels = [
170
  { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815 },
171
  ]
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  [[package]]
174
  name = "bidict"
175
  version = "0.23.1"
@@ -295,6 +314,15 @@ wheels = [
295
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
296
  ]
297
 
 
 
 
 
 
 
 
 
 
298
  [[package]]
299
  name = "dataclasses-json"
300
  version = "0.6.7"
@@ -357,6 +385,17 @@ wheels = [
357
  { url = "https://files.pythonhosted.org/packages/50/b3/b51f09c2ba432a576fe63758bddc81f78f0c6309d9e5c10d194313bf021e/fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d", size = 95164 },
358
  ]
359
 
 
 
 
 
 
 
 
 
 
 
 
360
  [[package]]
361
  name = "feedparser"
362
  version = "6.0.11"
@@ -369,6 +408,15 @@ wheels = [
369
  { url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343 },
370
  ]
371
 
 
 
 
 
 
 
 
 
 
372
  [[package]]
373
  name = "filetype"
374
  version = "1.2.0"
@@ -602,6 +650,12 @@ wheels = [
602
  { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 },
603
  ]
604
 
 
 
 
 
 
 
605
  [[package]]
606
  name = "jiter"
607
  version = "0.9.0"
@@ -625,6 +679,15 @@ wheels = [
625
  { url = "https://files.pythonhosted.org/packages/ee/47/3729f00f35a696e68da15d64eb9283c330e776f3b5789bac7f2c0c4df209/jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf", size = 206867 },
626
  ]
627
 
 
 
 
 
 
 
 
 
 
628
  [[package]]
629
  name = "jsonpatch"
630
  version = "1.33"
@@ -840,6 +903,15 @@ wheels = [
840
  { url = "https://files.pythonhosted.org/packages/03/a5/866b44697cee47d1cae429ed370281d937ad4439f71af82a6baaa139d26a/Lazify-0.4.0-py2.py3-none-any.whl", hash = "sha256:c2c17a7a33e9406897e3f66fde4cd3f84716218d580330e5af10cfe5a0cd195a", size = 3107 },
841
  ]
842
 
 
 
 
 
 
 
 
 
 
843
  [[package]]
844
  name = "literalai"
845
  version = "0.1.103"
@@ -950,6 +1022,45 @@ wheels = [
950
  { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195 },
951
  ]
952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953
  [[package]]
954
  name = "numpy"
955
  version = "2.2.2"
@@ -1172,6 +1283,36 @@ wheels = [
1172
  { url = "https://files.pythonhosted.org/packages/ec/1a/610693ac4ee14fcdf2d9bf3c493370e4f2ef7ae2e19217d7a237ff42367d/packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7", size = 53011 },
1173
  ]
1174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1175
  [[package]]
1176
  name = "portalocker"
1177
  version = "2.10.1"
@@ -1344,6 +1485,18 @@ wheels = [
1344
  { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572 },
1345
  ]
1346
 
 
 
 
 
 
 
 
 
 
 
 
 
1347
  [[package]]
1348
  name = "python-dotenv"
1349
  version = "1.1.0"
@@ -1470,6 +1623,18 @@ wheels = [
1470
  { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
1471
  ]
1472
 
 
 
 
 
 
 
 
 
 
 
 
 
1473
  [[package]]
1474
  name = "requests-toolbelt"
1475
  version = "1.0.0"
@@ -1509,6 +1674,15 @@ wheels = [
1509
  { url = "https://files.pythonhosted.org/packages/52/59/0782e51887ac6b07ffd1570e0364cf901ebc36345fea669969d2084baebb/simple_websocket-1.1.0-py3-none-any.whl", hash = "sha256:4af6069630a38ed6c561010f0e11a5bc0d4ca569b36306eb257cd9a192497c8c", size = 13842 },
1510
  ]
1511
 
 
 
 
 
 
 
 
 
 
1512
  [[package]]
1513
  name = "sniffio"
1514
  version = "1.3.1"
@@ -1518,6 +1692,15 @@ wheels = [
1518
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
1519
  ]
1520
 
 
 
 
 
 
 
 
 
 
1521
  [[package]]
1522
  name = "sqlalchemy"
1523
  version = "2.0.40"
@@ -1598,6 +1781,27 @@ wheels = [
1598
  { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
1599
  ]
1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1601
  [[package]]
1602
  name = "tomli"
1603
  version = "2.2.1"
 
10
  { name = "arxiv" },
11
  { name = "chainlit" },
12
  { name = "duckduckgo-search" },
13
+ { name = "feedparser" },
14
  { name = "langchain" },
15
  { name = "langchain-community" },
16
  { name = "langchain-core" },
 
19
  { name = "langchain-qdrant" },
20
  { name = "langchain-text-splitters" },
21
  { name = "langgraph" },
22
+ { name = "listparser" },
23
+ { name = "newspaper3k" },
24
  { name = "numpy" },
25
  { name = "openai" },
26
  { name = "pydantic" },
 
36
  { name = "arxiv", specifier = ">=2.2.0" },
37
  { name = "chainlit", specifier = "==2.0.4" },
38
  { name = "duckduckgo-search", specifier = ">=8.0.1" },
39
+ { name = "feedparser", specifier = ">=6.0.11" },
40
  { name = "langchain", specifier = ">=0.3.23" },
41
  { name = "langchain-community", specifier = ">=0.3.21" },
42
  { name = "langchain-core", specifier = ">=0.3.54" },
 
45
  { name = "langchain-qdrant", specifier = ">=0.2.0" },
46
  { name = "langchain-text-splitters", specifier = ">=0.3.8" },
47
  { name = "langgraph", specifier = ">=0.3.31" },
48
+ { name = "listparser", specifier = ">=0.20" },
49
+ { name = "newspaper3k", specifier = ">=0.2.8" },
50
  { name = "numpy", specifier = "==2.2.2" },
51
  { name = "openai", specifier = "==1.59.9" },
52
  { name = "pydantic", specifier = "==2.10.1" },
 
176
  { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815 },
177
  ]
178
 
179
+ [[package]]
180
+ name = "beautifulsoup4"
181
+ version = "4.13.4"
182
+ source = { registry = "https://pypi.org/simple" }
183
+ dependencies = [
184
+ { name = "soupsieve" },
185
+ { name = "typing-extensions" },
186
+ ]
187
+ sdist = { url = "https://files.pythonhosted.org/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067 }
188
+ wheels = [
189
+ { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285 },
190
+ ]
191
+
192
  [[package]]
193
  name = "bidict"
194
  version = "0.23.1"
 
314
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
315
  ]
316
 
317
+ [[package]]
318
+ name = "cssselect"
319
+ version = "1.3.0"
320
+ source = { registry = "https://pypi.org/simple" }
321
+ sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 }
322
+ wheels = [
323
+ { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 },
324
+ ]
325
+
326
  [[package]]
327
  name = "dataclasses-json"
328
  version = "0.6.7"
 
385
  { url = "https://files.pythonhosted.org/packages/50/b3/b51f09c2ba432a576fe63758bddc81f78f0c6309d9e5c10d194313bf021e/fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d", size = 95164 },
386
  ]
387
 
388
+ [[package]]
389
+ name = "feedfinder2"
390
+ version = "0.0.4"
391
+ source = { registry = "https://pypi.org/simple" }
392
+ dependencies = [
393
+ { name = "beautifulsoup4" },
394
+ { name = "requests" },
395
+ { name = "six" },
396
+ ]
397
+ sdist = { url = "https://files.pythonhosted.org/packages/35/82/1251fefec3bb4b03fd966c7e7f7a41c9fc2bb00d823a34c13f847fd61406/feedfinder2-0.0.4.tar.gz", hash = "sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe", size = 3297 }
398
+
399
  [[package]]
400
  name = "feedparser"
401
  version = "6.0.11"
 
408
  { url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343 },
409
  ]
410
 
411
+ [[package]]
412
+ name = "filelock"
413
+ version = "3.18.0"
414
+ source = { registry = "https://pypi.org/simple" }
415
+ sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075 }
416
+ wheels = [
417
+ { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215 },
418
+ ]
419
+
420
  [[package]]
421
  name = "filetype"
422
  version = "1.2.0"
 
650
  { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 },
651
  ]
652
 
653
+ [[package]]
654
+ name = "jieba3k"
655
+ version = "0.35.1"
656
+ source = { registry = "https://pypi.org/simple" }
657
+ sdist = { url = "https://files.pythonhosted.org/packages/a9/cb/2c8332bcdc14d33b0bedd18ae0a4981a069c3513e445120da3c3f23a8aaa/jieba3k-0.35.1.zip", hash = "sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10", size = 7423646 }
658
+
659
  [[package]]
660
  name = "jiter"
661
  version = "0.9.0"
 
679
  { url = "https://files.pythonhosted.org/packages/ee/47/3729f00f35a696e68da15d64eb9283c330e776f3b5789bac7f2c0c4df209/jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf", size = 206867 },
680
  ]
681
 
682
+ [[package]]
683
+ name = "joblib"
684
+ version = "1.4.2"
685
+ source = { registry = "https://pypi.org/simple" }
686
+ sdist = { url = "https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e", size = 2116621 }
687
+ wheels = [
688
+ { url = "https://files.pythonhosted.org/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6", size = 301817 },
689
+ ]
690
+
691
  [[package]]
692
  name = "jsonpatch"
693
  version = "1.33"
 
903
  { url = "https://files.pythonhosted.org/packages/03/a5/866b44697cee47d1cae429ed370281d937ad4439f71af82a6baaa139d26a/Lazify-0.4.0-py2.py3-none-any.whl", hash = "sha256:c2c17a7a33e9406897e3f66fde4cd3f84716218d580330e5af10cfe5a0cd195a", size = 3107 },
904
  ]
905
 
906
+ [[package]]
907
+ name = "listparser"
908
+ version = "0.20"
909
+ source = { registry = "https://pypi.org/simple" }
910
+ sdist = { url = "https://files.pythonhosted.org/packages/be/ee/d9f02600955ca34baf73e824d64b181b412745ed448a0ad1a92cef81115b/listparser-0.20.tar.gz", hash = "sha256:0dda5b41ca9531fc3c438eb4abf4d8a7cf03ef050d196875993e897a66c1f885", size = 12404 }
911
+ wheels = [
912
+ { url = "https://files.pythonhosted.org/packages/c9/27/bd96818acce8ed1909dff29817096016f5e958ef646a377b34d55afa23b3/listparser-0.20-py3-none-any.whl", hash = "sha256:5daae9895b75191a77b14f5b8eabf7a63a4ca440f215d9bd8d8e5a2eccde02ce", size = 14149 },
913
+ ]
914
+
915
  [[package]]
916
  name = "literalai"
917
  version = "0.1.103"
 
1022
  { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195 },
1023
  ]
1024
 
1025
+ [[package]]
1026
+ name = "newspaper3k"
1027
+ version = "0.2.8"
1028
+ source = { registry = "https://pypi.org/simple" }
1029
+ dependencies = [
1030
+ { name = "beautifulsoup4" },
1031
+ { name = "cssselect" },
1032
+ { name = "feedfinder2" },
1033
+ { name = "feedparser" },
1034
+ { name = "jieba3k" },
1035
+ { name = "lxml" },
1036
+ { name = "nltk" },
1037
+ { name = "pillow" },
1038
+ { name = "python-dateutil" },
1039
+ { name = "pyyaml" },
1040
+ { name = "requests" },
1041
+ { name = "tinysegmenter" },
1042
+ { name = "tldextract" },
1043
+ ]
1044
+ sdist = { url = "https://files.pythonhosted.org/packages/ce/fb/8f8525be0cafa48926e85b0c06a7cb3e2a892d340b8036f8c8b1b572df1c/newspaper3k-0.2.8.tar.gz", hash = "sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb", size = 205685 }
1045
+ wheels = [
1046
+ { url = "https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl", hash = "sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e", size = 211132 },
1047
+ ]
1048
+
1049
+ [[package]]
1050
+ name = "nltk"
1051
+ version = "3.9.1"
1052
+ source = { registry = "https://pypi.org/simple" }
1053
+ dependencies = [
1054
+ { name = "click" },
1055
+ { name = "joblib" },
1056
+ { name = "regex" },
1057
+ { name = "tqdm" },
1058
+ ]
1059
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/87/db8be88ad32c2d042420b6fd9ffd4a149f9a0d7f0e86b3f543be2eeeedd2/nltk-3.9.1.tar.gz", hash = "sha256:87d127bd3de4bd89a4f81265e5fa59cb1b199b27440175370f7417d2bc7ae868", size = 2904691 }
1060
+ wheels = [
1061
+ { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442 },
1062
+ ]
1063
+
1064
  [[package]]
1065
  name = "numpy"
1066
  version = "2.2.2"
 
1283
  { url = "https://files.pythonhosted.org/packages/ec/1a/610693ac4ee14fcdf2d9bf3c493370e4f2ef7ae2e19217d7a237ff42367d/packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7", size = 53011 },
1284
  ]
1285
 
1286
+ [[package]]
1287
+ name = "pillow"
1288
+ version = "11.2.1"
1289
+ source = { registry = "https://pypi.org/simple" }
1290
+ sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707 }
1291
+ wheels = [
1292
+ { url = "https://files.pythonhosted.org/packages/36/9c/447528ee3776e7ab8897fe33697a7ff3f0475bb490c5ac1456a03dc57956/pillow-11.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28", size = 3190098 },
1293
+ { url = "https://files.pythonhosted.org/packages/b5/09/29d5cd052f7566a63e5b506fac9c60526e9ecc553825551333e1e18a4858/pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830", size = 3030166 },
1294
+ { url = "https://files.pythonhosted.org/packages/71/5d/446ee132ad35e7600652133f9c2840b4799bbd8e4adba881284860da0a36/pillow-11.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0", size = 4408674 },
1295
+ { url = "https://files.pythonhosted.org/packages/69/5f/cbe509c0ddf91cc3a03bbacf40e5c2339c4912d16458fcb797bb47bcb269/pillow-11.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1", size = 4496005 },
1296
+ { url = "https://files.pythonhosted.org/packages/f9/b3/dd4338d8fb8a5f312021f2977fb8198a1184893f9b00b02b75d565c33b51/pillow-11.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f", size = 4518707 },
1297
+ { url = "https://files.pythonhosted.org/packages/13/eb/2552ecebc0b887f539111c2cd241f538b8ff5891b8903dfe672e997529be/pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155", size = 4610008 },
1298
+ { url = "https://files.pythonhosted.org/packages/72/d1/924ce51bea494cb6e7959522d69d7b1c7e74f6821d84c63c3dc430cbbf3b/pillow-11.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14", size = 4585420 },
1299
+ { url = "https://files.pythonhosted.org/packages/43/ab/8f81312d255d713b99ca37479a4cb4b0f48195e530cdc1611990eb8fd04b/pillow-11.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b", size = 4667655 },
1300
+ { url = "https://files.pythonhosted.org/packages/94/86/8f2e9d2dc3d308dfd137a07fe1cc478df0a23d42a6c4093b087e738e4827/pillow-11.2.1-cp313-cp313-win32.whl", hash = "sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2", size = 2332329 },
1301
+ { url = "https://files.pythonhosted.org/packages/6d/ec/1179083b8d6067a613e4d595359b5fdea65d0a3b7ad623fee906e1b3c4d2/pillow-11.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691", size = 2676388 },
1302
+ { url = "https://files.pythonhosted.org/packages/23/f1/2fc1e1e294de897df39fa8622d829b8828ddad938b0eaea256d65b84dd72/pillow-11.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c", size = 2414950 },
1303
+ { url = "https://files.pythonhosted.org/packages/c4/3e/c328c48b3f0ead7bab765a84b4977acb29f101d10e4ef57a5e3400447c03/pillow-11.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22", size = 3192759 },
1304
+ { url = "https://files.pythonhosted.org/packages/18/0e/1c68532d833fc8b9f404d3a642991441d9058eccd5606eab31617f29b6d4/pillow-11.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7", size = 3033284 },
1305
+ { url = "https://files.pythonhosted.org/packages/b7/cb/6faf3fb1e7705fd2db74e070f3bf6f88693601b0ed8e81049a8266de4754/pillow-11.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16", size = 4445826 },
1306
+ { url = "https://files.pythonhosted.org/packages/07/94/8be03d50b70ca47fb434a358919d6a8d6580f282bbb7af7e4aa40103461d/pillow-11.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b", size = 4527329 },
1307
+ { url = "https://files.pythonhosted.org/packages/fd/a4/bfe78777076dc405e3bd2080bc32da5ab3945b5a25dc5d8acaa9de64a162/pillow-11.2.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406", size = 4549049 },
1308
+ { url = "https://files.pythonhosted.org/packages/65/4d/eaf9068dc687c24979e977ce5677e253624bd8b616b286f543f0c1b91662/pillow-11.2.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91", size = 4635408 },
1309
+ { url = "https://files.pythonhosted.org/packages/1d/26/0fd443365d9c63bc79feb219f97d935cd4b93af28353cba78d8e77b61719/pillow-11.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751", size = 4614863 },
1310
+ { url = "https://files.pythonhosted.org/packages/49/65/dca4d2506be482c2c6641cacdba5c602bc76d8ceb618fd37de855653a419/pillow-11.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9", size = 4692938 },
1311
+ { url = "https://files.pythonhosted.org/packages/b3/92/1ca0c3f09233bd7decf8f7105a1c4e3162fb9142128c74adad0fb361b7eb/pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd", size = 2335774 },
1312
+ { url = "https://files.pythonhosted.org/packages/a5/ac/77525347cb43b83ae905ffe257bbe2cc6fd23acb9796639a1f56aa59d191/pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e", size = 2681895 },
1313
+ { url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234 },
1314
+ ]
1315
+
1316
  [[package]]
1317
  name = "portalocker"
1318
  version = "2.10.1"
 
1485
  { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572 },
1486
  ]
1487
 
1488
+ [[package]]
1489
+ name = "python-dateutil"
1490
+ version = "2.9.0.post0"
1491
+ source = { registry = "https://pypi.org/simple" }
1492
+ dependencies = [
1493
+ { name = "six" },
1494
+ ]
1495
+ sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
1496
+ wheels = [
1497
+ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 },
1498
+ ]
1499
+
1500
  [[package]]
1501
  name = "python-dotenv"
1502
  version = "1.1.0"
 
1623
  { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
1624
  ]
1625
 
1626
+ [[package]]
1627
+ name = "requests-file"
1628
+ version = "2.1.0"
1629
+ source = { registry = "https://pypi.org/simple" }
1630
+ dependencies = [
1631
+ { name = "requests" },
1632
+ ]
1633
+ sdist = { url = "https://files.pythonhosted.org/packages/72/97/bf44e6c6bd8ddbb99943baf7ba8b1a8485bcd2fe0e55e5708d7fee4ff1ae/requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658", size = 6891 }
1634
+ wheels = [
1635
+ { url = "https://files.pythonhosted.org/packages/d7/25/dd878a121fcfdf38f52850f11c512e13ec87c2ea72385933818e5b6c15ce/requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c", size = 4244 },
1636
+ ]
1637
+
1638
  [[package]]
1639
  name = "requests-toolbelt"
1640
  version = "1.0.0"
 
1674
  { url = "https://files.pythonhosted.org/packages/52/59/0782e51887ac6b07ffd1570e0364cf901ebc36345fea669969d2084baebb/simple_websocket-1.1.0-py3-none-any.whl", hash = "sha256:4af6069630a38ed6c561010f0e11a5bc0d4ca569b36306eb257cd9a192497c8c", size = 13842 },
1675
  ]
1676
 
1677
+ [[package]]
1678
+ name = "six"
1679
+ version = "1.17.0"
1680
+ source = { registry = "https://pypi.org/simple" }
1681
+ sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 }
1682
+ wheels = [
1683
+ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 },
1684
+ ]
1685
+
1686
  [[package]]
1687
  name = "sniffio"
1688
  version = "1.3.1"
 
1692
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
1693
  ]
1694
 
1695
+ [[package]]
1696
+ name = "soupsieve"
1697
+ version = "2.6"
1698
+ source = { registry = "https://pypi.org/simple" }
1699
+ sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 }
1700
+ wheels = [
1701
+ { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 },
1702
+ ]
1703
+
1704
  [[package]]
1705
  name = "sqlalchemy"
1706
  version = "2.0.40"
 
1781
  { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
1782
  ]
1783
 
1784
+ [[package]]
1785
+ name = "tinysegmenter"
1786
+ version = "0.3"
1787
+ source = { registry = "https://pypi.org/simple" }
1788
+ sdist = { url = "https://files.pythonhosted.org/packages/17/82/86982e4b6d16e4febc79c2a1d68ee3b707e8a020c5d2bc4af8052d0f136a/tinysegmenter-0.3.tar.gz", hash = "sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d", size = 16893 }
1789
+
1790
+ [[package]]
1791
+ name = "tldextract"
1792
+ version = "5.2.0"
1793
+ source = { registry = "https://pypi.org/simple" }
1794
+ dependencies = [
1795
+ { name = "filelock" },
1796
+ { name = "idna" },
1797
+ { name = "requests" },
1798
+ { name = "requests-file" },
1799
+ ]
1800
+ sdist = { url = "https://files.pythonhosted.org/packages/20/7a/e469c4f71231a848492da31a7be6921a6cd04ecc8eed58e924bece0fb6de/tldextract-5.2.0.tar.gz", hash = "sha256:c3a8c4daf2c25a57f54d6ef6762aeac7eff5ac3da04cdb607130be757b8457ab", size = 126839 }
1801
+ wheels = [
1802
+ { url = "https://files.pythonhosted.org/packages/5e/20/b400e99827439eb91d5aa283e09d43e7e46aba66b07edf6f09404cb741da/tldextract-5.2.0-py3-none-any.whl", hash = "sha256:59509cbf99628c9440f4d19d3a1fd8488d50297ea23879c136576263c5a04eba", size = 106308 },
1803
+ ]
1804
+
1805
  [[package]]
1806
  name = "tomli"
1807
  version = "2.2.1"