jensenwiedler committed on
Commit
ccce173
·
1 Parent(s): 3945599

basic agent with 30 score

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .env
2
+ node_modules
3
+ whisper-large-v3
agent/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (185 Bytes). View file
 
agent/__pycache__/graph.cpython-312.pyc ADDED
Binary file (2.15 kB). View file
 
agent/__pycache__/tools.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/tools.cpython-312.pyc and b/agent/__pycache__/tools.cpython-312.pyc differ
 
agent/graph.py CHANGED
@@ -1,23 +1,40 @@
 
1
  from langgraph.graph import StateGraph, MessagesState, START, END
2
- from langchain_core.messages import HumanMessage, AIMessage
3
  from langgraph.prebuilt import ToolNode, tools_condition
4
-
5
- from tools import TOOLS
6
 
7
  class State(MessagesState):
8
- file_name: str
9
 
10
- def retriever(state: State):
11
- if state.file_name:
12
- # Simulate file retrieval
13
- return {"file_content": f"Retrieved content from {state.file_name}"}
14
 
15
  def call_model(state: State):
16
- return {"messages": [AIMessage(content="Hello! How can I assist you today?")]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  def build_agent():
20
- graph_builder = StateGraph(MessagesState)
21
 
22
  graph_builder.add_node("call_model", call_model)
23
  graph_builder.add_node("tools", ToolNode(TOOLS))
 
1
+ from typing import Annotated, Optional
2
  from langgraph.graph import StateGraph, MessagesState, START, END
3
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
4
  from langgraph.prebuilt import ToolNode, tools_condition
5
+ from langchain_ollama import ChatOllama
6
+ from agent.tools import TOOLS
7
 
8
  class State(MessagesState):
9
+ file_path: str
10
 
11
+ model = ChatOllama(model="qwen3:32b")
12
+ #model = ChatOllama(model="llama3.2:3b")
13
+ model_with_tools = model.bind_tools(TOOLS)
 
14
 
15
  def call_model(state: State):
16
+ return {"messages": [AIMessage(content="FINAL ANSWER: right")]}
17
+ system_prompt = """
18
+ You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
19
+ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
20
+ If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
21
+ If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
22
+
23
+ Instructions for the tools:
24
+ - If you need information from the web, you must use both the web_search and wikipedia_search tools, unless the question mentions wikipedia. Then, you must use only the wikipedia_search tool.
25
+
26
+ Do not forget to use the FINAl ANSWER: [YOUR FINAL ANSWER] template!!!
27
+ """
28
+ if state["file_path"] and state["file_path"] != "":
29
+ system_prompt += f"\n\nYou have acces to a file at {state['file_path']}. You can use it to answer the question. Use this file path as input to relevant tools."
30
+
31
+ result = model_with_tools.invoke([SystemMessage(content=system_prompt)] + state["messages"])
32
+
33
+ return {"messages": [result]}
34
 
35
 
36
  def build_agent():
37
+ graph_builder = StateGraph(State)
38
 
39
  graph_builder.add_node("call_model", call_model)
40
  graph_builder.add_node("tools", ToolNode(TOOLS))
agent/tools.py CHANGED
@@ -3,39 +3,60 @@ from langchain_core.tools import tool
3
  from langchain_community.document_loaders import WikipediaLoader, YoutubeLoader
4
  from langchain_community.tools import DuckDuckGoSearchResults
5
  from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
 
 
 
 
 
 
 
 
 
 
 
6
  @tool
7
  def wikipedia_search(query: str) -> str:
8
  """
9
- Search Wikipedia for a given query and return max 2 results.
10
 
11
  Args:
12
  query: The search query.
13
  """
14
  # Simulate a search operation
15
- search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
16
-
 
 
 
 
 
 
 
17
  formatted_docs = "\n\n---\n\n".join(
18
  [
19
- f'<Document title="{doc.metadata["title"]}"/>\n{doc.page_content}\n</Document>'
20
- for doc in search_docs
21
  ])
22
  return formatted_docs
23
 
24
  @tool
25
  def youtube_transcript(url: str) -> str:
26
  """"Returns the transcript of a YouTube video given its URL.
 
27
  Args:
28
  url: The YouTube video URL.
29
  """
30
- try:
31
- transcripts = YoutubeLoader.from_youtube_url(url, add_video_info=False).load()
32
- return f"Video Transcript: {transcripts[0].page_content}"
33
- except Exception as e:
34
- return "No transcript available for this video. Error: {e}"
 
 
 
 
 
35
 
36
- wrapper = DuckDuckGoSearchAPIWrapper(max_results=5)
37
-
38
- search = DuckDuckGoSearchResults(output_format="list", api_wrapper=wrapper)
39
 
40
  @tool
41
  def web_search(query: str) -> str:
@@ -46,13 +67,17 @@ def web_search(query: str) -> str:
46
  query: The search query.
47
  """
48
  # Simulate a web search operation
49
- query = "obama"
50
- search_results = search.invoke(query)
51
- formatted_result = "\n\n---\n\n".join([
52
- f"- {result['title']}: {result['link']} \n {result['snippet']}"
53
- for result in search_results
54
- ])
55
- return f"Web search results for '{query}'"
 
 
 
 
56
 
57
  @tool
58
  def add_numbers(numbers: List[float]) -> float:
@@ -76,5 +101,144 @@ def multiply_numbers(numbers: List[float]) -> float:
76
  return result
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- TOOLS = [wikipedia_search, web_search, youtube_transcript, add_numbers, multiply_numbers]
 
3
  from langchain_community.document_loaders import WikipediaLoader, YoutubeLoader
4
  from langchain_community.tools import DuckDuckGoSearchResults
5
  from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
6
+ from langchain_ollama import ChatOllama
7
+ from langchain_sandbox import PyodideSandbox
8
+ import base64
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ import torch
11
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
12
+ from docling.document_converter import DocumentConverter
13
+ from langchain_tavily import TavilySearch
14
+
15
+ doc_converter = DocumentConverter()
16
+
17
  @tool
18
  def wikipedia_search(query: str) -> str:
19
  """
20
+ Search Wikipedia for a given query and return max 1 result.
21
 
22
  Args:
23
  query: The search query.
24
  """
25
  # Simulate a search operation
26
+ search_docs = WikipediaLoader(query=query, load_max_docs=1).load()
27
+ docling_docs = [doc_converter.convert(doc.metadata["source"]).document.export_to_markdown() for doc in search_docs]
28
+ start_indexes = []
29
+ for d in docling_docs:
30
+ start_index = d.find("From Wikipedia")
31
+ if start_index != -1:
32
+ start_indexes.append(start_index)
33
+ else:
34
+ start_indexes.append(0)
35
  formatted_docs = "\n\n---\n\n".join(
36
  [
37
+ f'<Document title="{search_doc.metadata["title"]}"/>\n{docling_doc[start_index:]}\n</Document>'
38
+ for search_doc, docling_doc, start_index in zip(search_docs, docling_docs, start_indexes)
39
  ])
40
  return formatted_docs
41
 
42
  @tool
43
  def youtube_transcript(url: str) -> str:
44
  """"Returns the transcript of a YouTube video given its URL.
45
+ This is a text-based tool and should not be used for visual information of the video.
46
  Args:
47
  url: The YouTube video URL.
48
  """
49
+ max_tries = 3
50
+ for _ in range(max_tries):
51
+ try:
52
+ transcripts = YoutubeLoader.from_youtube_url(url, add_video_info=False).load()
53
+ return f"Video Transcript: {transcripts[0].page_content}"
54
+ except Exception as e:
55
+ print(f"Attempt failed: {e}")
56
+ continue
57
+ # If all attempts fail, return an error message
58
+ return "No transcript available. This video might not have a transcript or the URL is invalid."
59
 
 
 
 
60
 
61
  @tool
62
  def web_search(query: str) -> str:
 
67
  query: The search query.
68
  """
69
  # Simulate a web search operation
70
+ tavily_search = TavilySearch(max_results=3)
71
+ search_docs = tavily_search.invoke(query)
72
+
73
+ # Format
74
+ formatted_search_docs = "\n\n---\n\n".join(
75
+ [
76
+ f'<Document href="{doc["url"]}">\n{doc["content"]}\n</Document>'
77
+ for doc in search_docs["results"]
78
+ ]
79
+ )
80
+ return f"Web search results for '{query}':\n\n{formatted_search_docs}"
81
 
82
  @tool
83
  def add_numbers(numbers: List[float]) -> float:
 
101
  return result
102
 
103
 
104
+ vision_llm = ChatOllama(model="gemma3:27b")
105
+
106
+ # might be better to use supervisor method..
107
+ @tool
108
def image_question_answering(img_path: str, question: str) -> str:
    """
    Given an image path and a question, return the answer to the question based on the image. Just pass the initial question from the human as a query.

    Args:
        img_path: The path to the image.
        question: The question to ask about the image.
    """
    import mimetypes

    system_prompt = """
    You are a helpful assistant that can answer questions about images.
    You need to think step by step carefully, provide your thinking process and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
    """

    try:
        # Read the image and encode it as base64 for the data URL below.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Derive the MIME type from the file name; keep the previous
        # "image/png" as the fallback when it cannot be determined.
        guessed_type = mimetypes.guess_type(img_path)[0]
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/png"

        # The caller's question is forwarded unchanged (an earlier revision
        # replaced it with a hard-coded chess prompt at this point).
        message = [
            SystemMessage(content=system_prompt),
            HumanMessage(
                content=[
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            ),
        ]

        # Call the vision-capable model.
        response = vision_llm.invoke(message)
        return response.content

    except Exception as e:
        # Return the failure as text so the calling agent can react to it.
        error_msg = f"Error image questioning: {str(e)}"
        print(error_msg)
        return error_msg
157
+
158
# Pick the inference device: Apple MPS first (the previously hard-coded
# choice), then CUDA, then CPU, so importing this module does not fail on
# machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Local directory holding the Whisper checkpoint.
checkpoint = "./whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    checkpoint,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(checkpoint)

# Speech-recognition pipeline shared by speech_to_text.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float32,
    device=device,
)
173
+
174
+ @tool
175
def speech_to_text(audio_path: str) -> str:
    """
    Convert speech to text using a given audio file. Not for youtube links.

    Args:
        audio_path: The path to the audio file.
    """
    try:
        return pipe(audio_path)["text"].strip()
    except Exception:
        # Second attempt with timestamps enabled.
        # NOTE(review): presumably needed for recordings longer than the
        # model's single-window limit; confirm against the pipeline docs.
        try:
            return pipe(audio_path, return_timestamps=True)["text"].strip()
        except Exception as e:
            # Both attempts failed: report the error as text instead of
            # letting it escape the tool.
            return f"Error processing audio file: {str(e)}"
189
+
190
+ @tool
191
def read_file_content(path: str) -> str:
    """
    Read the content of a file (pdf, docs, xlsx, etc.) but also from a URL (like arxiv or websites) and returns it as markdown.

    Args:
        path: The path to the file, or the URL to read.
    """
    try:
        markdown = doc_converter.convert(path).document.export_to_markdown()
    except Exception as e:
        # Conversion problems are returned as text so the agent can react.
        return f"Error reading file: {str(e)}"
    return f"File Content:\n\n{markdown}"
203
+
204
+ sandbox = PyodideSandbox(
205
+ # Allow Pyodide to install python packages that
206
+ # might be required.
207
+ allow_net=True,
208
+ )
209
+
210
+ @tool
211
async def run_python_code(input_type: str, input: str) -> str:
    """
    Run Python code in a sandboxed environment. You can provide either a code snippet or a file path.
    1. If input_type is "code", input should be a string containing the Python code to run.
    2. If input_type is "file", input should be a string containing the path to the file.

    Args:
        input_type: The type of input, code or file.
        input: The Python code to run or the path to the file.
    """
    try:
        if input_type == "code":
            code = input
        elif input_type == "file":
            # Read as UTF-8 explicitly rather than relying on the
            # platform's default encoding.
            with open(input, "r", encoding="utf-8") as file:
                code = file.read()
        else:
            return "Invalid input type. Please provide 'code' or 'file' as input_type."
        result = await sandbox.execute(code)
        return f"Result execution: result: {result.result}, stdout: {result.stdout}, stderr: {result.stderr}, status: {result.status}"
    except Exception as e:
        # Covers unreadable files as well as sandbox failures.
        return f"Error executing Python code: {str(e)}"
232
+
233
+ @tool
234
def reverse_string(input: str) -> str:
    """
    Return the characters of a string in reverse order.

    Args:
        input: The string to reverse.
    """
    return "".join(reversed(input))
241
+
242
+
243
 
244
# Every tool exposed to the agent, in the same order as before.
TOOLS = [
    wikipedia_search,
    web_search,
    youtube_transcript,
    add_numbers,
    multiply_numbers,
    image_question_answering,
    speech_to_text,
    read_file_content,
    run_python_code,
    reverse_string,
]
app.py CHANGED
@@ -1,13 +1,16 @@
1
- from io import BytesIO
2
  import os
3
  import gradio as gr
4
  import requests
5
- import inspect
6
  import pandas as pd
7
 
8
  from langchain_core.messages import HumanMessage
9
  from agent.graph import build_agent
10
 
 
 
 
11
  # (Keep Constants as is)
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -17,22 +20,31 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
  class BasicAgent:
18
  def __init__(self):
19
  self.agent = build_agent()
20
- def __call__(self, question: str, task_id: str, file_name="") -> str:
21
  messages = [HumanMessage(content=question)]
22
 
23
  if file_name:
24
- task_id = "cca530fc-4052-43b2-b130-b30968d8aa44"
25
  response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
26
  response.raise_for_status()
27
  file_data = response.content
28
- #file_data = BytesIO(file_data)
29
-
30
-
31
- state = self.agent.invoke({"messages": messages, "file_name": file_name})
 
 
 
 
32
  answer = state["messages"][-1].content
 
 
 
 
 
 
33
  return answer
34
 
35
- def run_and_submit_all( profile: gr.OAuthProfile | None):
36
  """
37
  Fetches all questions, runs the BasicAgent on them, submits all answers,
38
  and displays the results.
@@ -94,7 +106,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
94
  print(f"Skipping item with missing task_id or question: {item}")
95
  continue
96
  try:
97
- submitted_answer = agent(question_text, file_name=file_name)
98
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
99
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
100
  except Exception as e:
@@ -207,4 +219,9 @@ if __name__ == "__main__":
207
  print("-"*(60 + len(" App Starting ")) + "\n")
208
 
209
  print("Launching Gradio Interface for Basic Agent Evaluation...")
210
- demo.launch(debug=True, share=False)
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
  import os
3
  import gradio as gr
4
  import requests
5
+ import asyncio
6
  import pandas as pd
7
 
8
  from langchain_core.messages import HumanMessage
9
  from agent.graph import build_agent
10
 
11
+
12
+ load_dotenv()
13
+
14
  # (Keep Constants as is)
15
  # --- Constants ---
16
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
20
  class BasicAgent:
21
  def __init__(self):
22
  self.agent = build_agent()
23
+ async def __call__(self, question: str, task_id: str, file_name="") -> str:
24
  messages = [HumanMessage(content=question)]
25
 
26
  if file_name:
 
27
  response = requests.get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=15)
28
  response.raise_for_status()
29
  file_data = response.content
30
+ # write to temp location
31
+ with open(file_name, "wb") as f:
32
+ f.write(file_data)
33
+ state = {"messages": messages, "file_path": file_name}
34
+ print(f"question: {question}")
35
+ state = await self.agent.ainvoke(state)
36
+ for msg in state["messages"]:
37
+ msg.pretty_print()
38
  answer = state["messages"][-1].content
39
+ try:
40
+ answer = answer.split("FINAL ANSWER: ")[-1].strip()
41
+ except Exception as e:
42
+ print(f"Error parsing answer: {e}")
43
+ answer = "AGENT ERROR: Unable to parse answer."
44
+ print(f"answer: {answer}")
45
  return answer
46
 
47
+ async def run_and_submit_all( profile: gr.OAuthProfile | None):
48
  """
49
  Fetches all questions, runs the BasicAgent on them, submits all answers,
50
  and displays the results.
 
106
  print(f"Skipping item with missing task_id or question: {item}")
107
  continue
108
  try:
109
+ submitted_answer = await agent(question_text, task_id=task_id, file_name=file_name)
110
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
111
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
112
  except Exception as e:
 
219
  print("-"*(60 + len(" App Starting ")) + "\n")
220
 
221
  print("Launching Gradio Interface for Basic Agent Evaluation...")
222
+ demo.launch(debug=True, share=False)
223
+
224
+ # agent = BasicAgent()
225
+
226
+ # res = asyncio.run(agent("Hello, how are you?", "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3", "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"))
227
+ # print(res)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
 
2
  requests
3
  langgraph
4
  langchain
@@ -6,4 +7,7 @@ langchain-community
6
  wikipedia
7
  youtube-transcript-api
8
  duckduckgo-search
9
- docling
 
 
 
 
1
  gradio
2
+ gradio[oauth]
3
  requests
4
  langgraph
5
  langchain
 
7
  wikipedia
8
  youtube-transcript-api
9
  duckduckgo-search
10
+ langchain-docling
11
+ langchain-sandbox
12
+ langchain-ollama
13
+ langchain-tavily