Groq Model Update
- exports/gaia_results_20260104_064016.json +110 -0
- output/gaia_results_20260104_005858.md +0 -35
- output/gaia_results_20260104_061543.json +110 -0
- src/agent/llm_client.py +238 -175
exports/gaia_results_20260104_064016.json
ADDED
@@ -0,0 +1,110 @@
+{
+  "metadata": {
+    "generated": "2026-01-04 06:40:16",
+    "timestamp": "20260104_064016",
+    "total_questions": 20
+  },
+  "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/20 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
+  "results": [
+    {
+      "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
+      "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+      "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
+      "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
+      "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
+      "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
+      "submitted_answer": "<think>\nOkay, let's tackle this question. The user is asking who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016.\n\nFirst, I need to look through the provided evidence to find any mention of a dinosaur-related article promoted in November 2016. The evidence includes several web search results from Wikipedia pages related to November 2016 featured articles.\n\nLooking at the first result from \"Wikipedia:Today's featured article/November 2016\", the snippet mentions various topics like a RAAF officer, General Patton, a video game, and a Star Trek film. There's no mention of a dinosaur here.\n\nThe second result from \"Wikipedia:Today's featured article/recent TFAs/November 2016\" lists entries for November dates in 2016. The entries include \"Mughal-e-Azam\", \"Star Trek: First Contact\", \"Warlugulong\", \"Banksia canei\", \"SMS Lützow\", and \"Scotland national football team\". None of these seem to be about a dinosaur. Warlugulong is an art piece, and Banksia canei is a plant. SMS Lützow is a"
+    },
+    {
+      "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
+      "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
+      "submitted_answer": "b, d, e"
+    },
+    {
+      "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+      "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+      "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
+      "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
+      "submitted_answer": "acorns, bell pepper, broccoli, celery, green beans, lettuce, zucchini"
+    },
+    {
+      "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
+      "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
+      "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
+      "submitted_answer": "<think>\nOkay, let's tackle this question. The user is asking who the actor who played Ray in the Polish-language version of \"Everybody Loves Raymond\" played in \"Magda M.,\" and they want just the first name.\n\nFirst, I need to look through the provided evidence to find the actor's name. The key part here is identifying the Polish dubbing actor for Ray. Let's check the evidence snippets.\n\nLooking at Evidence 1, the web search results. The first result mentions Peter Boyle as Frank Barone in the sitcom, but that's the original US version. The second result from Screen Rant says Ray Romano plays Ray Barone. The third result from Yahoo also mentions Ray Romano as Ray Barone. The fourth result is an Instagram post titled \"Polish Dubbing Cast Everybody Loves Raymond Ray\" but the snippet doesn't mention the actor's name. The fifth result is a YouTube video about Ray Romano, but that's the original actor.\n\nWait, the user is asking about the Polish-language version. The original actor is Ray Romano, but in Polish dubbing, a different actor would voice him. However, none of the evidence snippets explicitly state the name of the Polish actor who dubbed Ray Romano. The fourth result's"
+    },
+    {
+      "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+      "question": "What is the final numeric output from the attached Python code?",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type: . Supported: .pdf, .xlsx, .xls, .docx, .txt, .csv"
+    },
+    {
+      "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
+      "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
+      "submitted_answer": "<think>\nOkay, let's see. The user is asking about the number of at-bats for the Yankee player who had the most walks in the 1977 regular season. \n\nFirst, I need to figure out who that player is. The evidence provided includes a few sources. The first one is the Wikipedia page for the 1977 Yankees season, but the snippet doesn't mention walks. The third result is from ESPN's MLB Pitching Leaders, but that's about pitchers' walks, not hitters. The fourth result is from MLB.com's 1977 Yankees stats. Let me check that snippet.\n\nLooking at the snippet from MLB.com, there's a table with player stats. The first entry is Graig Nettles with 158 games and 589 at-bats. Then Chris Chambliss has 157 games and 600 at-bats. The walks (BB) for Nettles are 37, and for Chambliss it's 17. Wait, that can't be right. Wait, maybe I'm misreading. Let me check again. The columns are labeled BB (walks) and SO (strikeouts). For Graig Nettles"
+    },
+    {
+      "task_id": "1f975693-876d-457b-a649-393859e79bf3",
+      "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
+      "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
+      "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
+      "submitted_answer": "<think>\nOkay, let's tackle this question. The user is asking where the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper were eventually deposited. They want just the city name without abbreviations.\n\nFirst, I need to look through the provided evidence to find any mention of deposition locations. Let's go through each of the web search results.\n\nEvidence 1's snippet talks about specimens collected at Mê Linh biological station in North Vietnam, Vinh Phuc Province. But it doesn't mention where they were deposited. The other entries in Evidence 1's results mention Sapa, Lao Cai Province, but that's about museum specimens from Vietnam in general, not specifically Kuznetzov's. The third result mentions the Field Museum of Natural History, Australian National Wildlife Collection, etc., but again, not specific to Kuznetzov's specimens. The fourth result is about central Vietnam specimens being different but doesn't mention deposition. The last result is about Nedoshivina's Google Scholar profile, which doesn't help here.\n\nLooking back, the first result's snippet mentions Mê Linh biological station where specimens were collected. However, the question is about where they were deposited, not where"
+    },
+    {
+      "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
+      "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
+      "submitted_answer": "CUB"
+    },
+    {
+      "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
+      "question": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
+      "submitted_answer": "<think>\nOkay, let's see. The user is asking for the pitchers with the numbers immediately before and after Taishō Tamai's number as of July 2023, using their last names in Roman characters.\n\nFirst, I need to find out what number Taishō Tamai wears. Looking at the evidence provided, the fifth result from the web search shows a 2023 Hokkaido Nippon-Ham Fighters Individual Pitching list. The entry for Tamai, Taisho is listed with the number 50. So his number is 50.\n\nNext, I need to check the numbers before and after 50. The same list in the fifth result shows Tanaka, Eito with number 2, Tanaka, Seigi with 47, and Tateno, Kazuaki with 4. Wait, that doesn't seem to be in order. Maybe the list isn't sorted by numbers. Let me check again. The snippet shows \"Tamai, Taisho, 50 ; Tanaka, Eito, 2 ; Tanaka, Seigi, 47 ; Tateno, Kazuaki, 4\". Hmm, that's not a sequential list. Maybe"
+    },
+    {
+      "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+      "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type: . Supported: .pdf, .xlsx, .xls, .docx, .txt, .csv"
+    },
+    {
+      "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
+      "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
+      "submitted_answer": "Unable to answer"
+    }
+  ]
+}
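
The export above is flat: a "metadata" block, the raw "submission_status" string, and a "results" array of task_id/question/submitted_answer records. A minimal sketch for tallying how a run's answers break down; the file path and bucket labels are illustrative, not part of the repo, and the categories simply mirror the patterns visible above ("ERROR:" strings, "Unable to answer", and leaked "<think>" reasoning):

```python
import json
from collections import Counter

# Load one GAIA export and bucket its submitted answers.
with open("exports/gaia_results_20260104_064016.json") as f:
    export = json.load(f)

buckets = Counter()
for record in export["results"]:
    answer = record["submitted_answer"]
    if answer.startswith("ERROR:"):
        buckets["tool/LLM error"] += 1      # tool failures and LLM error dumps
    elif answer == "Unable to answer":
        buckets["no answer"] += 1
    elif answer.lstrip().startswith("<think>"):
        buckets["leaked reasoning"] += 1    # raw chain-of-thought submitted as the answer
    else:
        buckets["answered"] += 1

print(export["metadata"]["generated"], dict(buckets))
```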
output/gaia_results_20260104_005858.md
DELETED
@@ -1,35 +0,0 @@
-# GAIA Agent Evaluation Results
-
-**Generated:** 2026-01-04 00:58:58
-
-## Submission Status
-
-Submission Successful!
-User: mangoobee
-Overall Score: 5.0% (1/20 correct)
-Message: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
-
-## Questions and Answers
-
-| Task ID | Question | Submitted Answer |
-|---------|----------|------------------|
-| 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can ... | Unable to answer |
-| a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird spec... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
-| 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI | ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword ... |
-| cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the correct next mov... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
-| 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted i... | FunkMonk |
-| 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|---\|---\... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
-| 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Teal'c say in respon... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
-| cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry mate... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
-| 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler w... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
-| 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need fo... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
-| 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play i... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
-| f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
-| 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season have that same sea... | 589 |
-| 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study fo... | ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type:... |
-| 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This art... | Unable to answer |
-| bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually... | Unable to answer |
-| cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
-| a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
-| 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food chain. What were t... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
-| 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Century (after 1977)... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
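
The deleted report rendered the same records as a Markdown table, clipping long cells with "..." and escaping pipes inside questions. A sketch of how such a report might be produced; the helper names and the 100-character clip width are guesses, not the repo's actual code:

```python
def _clip(text: str, width: int = 100) -> str:
    """Collapse newlines, escape table pipes, and truncate long cells."""
    text = " ".join(text.split()).replace("|", "\\|")
    return text if len(text) <= width else text[:width] + "..."

def render_report(generated: str, status: str, results: list[dict]) -> str:
    """Render results in the Markdown report format shown above."""
    lines = [
        "# GAIA Agent Evaluation Results",
        "",
        f"**Generated:** {generated}",
        "",
        "## Submission Status",
        "",
        status,
        "",
        "## Questions and Answers",
        "",
        "| Task ID | Question | Submitted Answer |",
        "|---------|----------|------------------|",
    ]
    for r in results:
        lines.append(
            f"| {r['task_id']} | {_clip(r['question'])} | {_clip(r['submitted_answer'])} |"
        )
    return "\n".join(lines)
```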
output/gaia_results_20260104_061543.json
ADDED
@@ -0,0 +1,110 @@
+{
+  "metadata": {
+    "generated": "2026-01-04 06:15:43",
+    "timestamp": "20260104_061543",
+    "total_questions": 20
+  },
+  "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/20 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
+  "results": [
+    {
+      "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
+      "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+      "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
+      "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
+      "submitted_answer": "right"
+    },
+    {
+      "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
+      "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
+      "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 492.952507ms. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f69f-41685c16026af93414725956;de9f587c-8725-431a-99b2-f8b7835a5561)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmGokhbSbmayAs4y5Bz'}"
+    },
+    {
+      "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
+      "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 56.98168557s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 56\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f6a3-27b6720e7fddfb820d63b308;5de2df42-8300-4055-8b24-064192f1acf4)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmH4NHSqUTCDGpEyRyn'}; Tool calculator failed: ValueError: signal only works in main thread of the main interpreter"
+    },
+    {
+      "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+      "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed"
+    },
+    {
+      "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+      "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+      "submitted_answer": "Agnew"
+    },
+    {
+      "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
+      "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
+      "submitted_answer": "sweet potatoes, green beans, broccoli, celery, zucchini, lettuce, acorns, bell pepper, corn, rice, peanuts, flour, eggs, milk, whole bean coffee, Oreos, fresh basil, plums, whole allspice\n\nHowever, considering the specific requirement to exclude botanical fruits and include only vegetables, and to alphabetize the list, here is the refined list:\n\nacorns, bell pepper, broccoli, celery, corn, green beans, lettuce, peanuts, rice, sweet potatoes, zucchini"
+    },
+    {
+      "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
+      "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type: .mp3. Supported: .pdf, .xlsx, .xls, .docx, .txt, .csv"
+    },
+    {
+      "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
+      "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+      "question": "What is the final numeric output from the attached Python code?",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 2.511765611s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 2\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f715-3234013d1b53716e2ab85ef7;a9fb7ffc-c425-4599-a90c-4dc51440ecdd)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmRVtRKKg4FQLFMzeQR'}"
+    },
+    {
+      "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
+      "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
+      "submitted_answer": "589"
+    },
+    {
+      "task_id": "1f975693-876d-457b-a649-393859e79bf3",
+      "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type: .mp3. Supported: .pdf, .xlsx, .xls, .docx, .txt, .csv"
+    },
+    {
+      "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
+      "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
+      "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 6.933570206s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 6\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f74d-3b67a13544a874f84a51ede0;a1e97094-bfbe-4ac6-a8c3-f4825d1ff54c)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmVbYr4mBof1pxst7xt'}"
+    },
+    {
+      "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
+      "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 56.632557269s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 56\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f757-43b0f00c7032c65c0f1e1a1c;576ff5a1-2b1b-4aa9-a78f-8c0d61b5c142)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmWMTJTnEMwg8y3YCMY'}"
+    },
+    {
+      "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
+      "question": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
+      "submitted_answer": "Unable to answer"
+    },
+    {
+      "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+      "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 29.604589104s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 29\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f772-2efe8bf7611376d52370066f;3d15f77a-fa3d-44f4-b9e4-1348bd7b9395)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmYM1p29xiWtktGgkS2'}"
+    },
+    {
+      "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
+      "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
+      "submitted_answer": "ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp\nPlease retry in 17.145959071s. [links {\n  description: \"Learn more about Gemini API quotas\"\n  url: \"https://ai.google.dev/gemini-api/docs/rate-limits\"\n}\n, violations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_input_token_count\"\n  quota_id: \"GenerateContentInputTokensPerModelPerMinute-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerMinutePerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\nviolations {\n  quota_metric: \"generativelanguage.googleapis.com/generate_content_free_tier_requests\"\n  quota_id: \"GenerateRequestsPerDayPerProjectPerModel-FreeTier\"\n  quota_dimensions {\n    key: \"model\"\n    value: \"gemini-2.0-flash-exp\"\n  }\n  quota_dimensions {\n    key: \"location\"\n    value: \"global\"\n  }\n}\n, retry_delay {\n  seconds: 17\n}\n], HF: Client error '402 Payment Required' for url 'https://router.huggingface.co/v1/chat/completions' (Request ID: Root=1-6959f77e-2589e16a2d8444713b7f15c8;12f33ac5-b138-4b33-a615-35a768036af8)\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402\n\nYou have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account., Groq: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}, Claude: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CWmmZGCcuUZwobyWLJu3N'}"
+    }
+  ]
+}
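
The aggregated error strings above record the provider order the agent fell through: Gemini (429, quota exhausted), HuggingFace (402, payment required), Groq (400, `llama-3.1-70b-versatile` decommissioned), then Claude (400, credit balance too low), with every failure folded into one "failed with all LLMs" exception. A sketch of that pattern; the function name and parameters are placeholders, not the repo's actual signature:

```python
from typing import Callable

def plan_with_fallback(
    question: str,
    providers: dict[str, Callable[[str], str]],
) -> str:
    """Try each provider in order; if all fail, raise one combined error."""
    errors: list[str] = []
    for name, plan_fn in providers.items():
        try:
            return plan_fn(question)
        except Exception as e:
            # Record the failure in the "Provider: error" form seen above,
            # then fall through to the next provider.
            errors.append(f"{name}: {e}")
    raise Exception("Planning failed with all LLMs. " + ", ".join(errors))
```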
src/agent/llm_client.py
CHANGED
@@ -38,7 +38,7 @@ HF_MODEL = "Qwen/Qwen2.5-72B-Instruct" # Excellent for function calling and rea
 # Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"

 # Groq Configuration
-GROQ_MODEL = "llama-3.1-70b-versatile"
 # Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"

 # Shared Configuration

@@ -54,6 +54,7 @@ logger = logging.getLogger(__name__)
 # Retry Logic with Exponential Backoff
 # ============================================================================

 def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:
     """
     Retry function with exponential backoff on quota errors.

@@ -81,15 +82,15 @@ def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:

             # Check if this is a quota/rate limit error
             is_quota_error = (
-                "429" in str(e)
-                "quota" in error_str
-                "rate limit" in error_str
-                "too many requests" in error_str
             )

             if is_quota_error and attempt < max_retries - 1:
                 # Exponential backoff: 1s, 2s, 4s
-                wait_time = 2 ** attempt
                 logger.warning(
                     f"Quota/rate limit error (attempt {attempt + 1}/{max_retries}): {e}. "
                     f"Retrying in {wait_time}s..."

@@ -100,10 +101,12 @@ def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:
             # If not a quota error, or last attempt, raise immediately
             raise

 # ============================================================================
 # Client Initialization
 # ============================================================================

 def create_claude_client() -> Anthropic:
     """Initialize Anthropic client with API key from environment."""
     api_key = os.getenv("ANTHROPIC_API_KEY")

@@ -149,10 +152,11 @@
 # Planning Functions - Claude Implementation
 # ============================================================================

 def plan_question_claude(
     question: str,
     available_tools: Dict[str, Dict],
-    file_paths: Optional[List[str]] = None
 ) -> str:
     """Analyze question and generate execution plan using Claude."""
     client = create_claude_client()

@@ -168,7 +172,9 @@
     # File context
     file_context = ""
     if file_paths:
-        file_context = f"\n\nAvailable files:\n" + "\n".join(

     # Prompt for planning
     system_prompt = """You are a planning agent for answering complex questions.

@@ -197,7 +203,7 @@ Create an execution plan to answer this question. Format as numbered steps."""
         max_tokens=MAX_TOKENS,
         temperature=TEMPERATURE,
         system=system_prompt,
-        messages=[{"role": "user", "content": user_prompt}]
     )

     plan = response.content[0].text

@@ -210,10 +216,11 @@
 # Planning Functions - Gemini Implementation
 # ============================================================================

 def plan_question_gemini(
     question: str,
     available_tools: Dict[str, Dict],
-    file_paths: Optional[List[str]] = None
 ) -> str:
     """Analyze question and generate execution plan using Gemini."""
     model = create_gemini_client()

@@ -229,7 +236,9 @@
     # File context
     file_context = ""
     if file_paths:
-        file_context = f"\n\nAvailable files:\n" + "\n".join(

     # Combined prompt (Gemini doesn't use separate system/user like Claude)
     prompt = f"""You are a planning agent for answering complex questions.

@@ -256,9 +265,8 @@
     response = model.generate_content(
         prompt,
         generation_config=genai.types.GenerationConfig(
-            temperature=TEMPERATURE,
-
-        )
     )

     plan = response.text

@@ -271,10 +279,11 @@
 # Planning Functions - HuggingFace Implementation
 # ============================================================================

 def plan_question_hf(
     question: str,
     available_tools: Dict[str, Dict],
-    file_paths: Optional[List[str]] = None
 ) -> str:
     """Analyze question and generate execution plan using HuggingFace Inference API."""
     client = create_hf_client()

@@ -290,7 +299,9 @@
     # File context
     file_context = ""
     if file_paths:
-        file_context = f"\n\nAvailable files:\n" + "\n".join(

     # System message for Qwen 2.5 (supports system/user format)
     system_prompt = """You are a planning agent for answering complex questions.

@@ -317,13 +328,11 @@
     # HuggingFace Inference API chat completion
     messages = [
         {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt}
     ]

     response = client.chat_completion(
-        messages=messages,
-        max_tokens=MAX_TOKENS,
-        temperature=TEMPERATURE
     )

     plan = response.choices[0].message.content

@@ -336,10 +345,11 @@
 # Planning Functions - Groq Implementation
 # ============================================================================

 def plan_question_groq(
     question: str,
     available_tools: Dict[str, Dict],
-    file_paths: Optional[List[str]] = None
 ) -> str:
     """Analyze question and generate execution plan using Groq."""
     client = create_groq_client()

@@ -355,7 +365,9 @@
     # File context
     file_context = ""
     if file_paths:
-        file_context = f"\n\nAvailable files:\n" + "\n".join(

     # System message for Llama 3.1 (supports system/user format)
     system_prompt = """You are a planning agent for answering complex questions.

@@ -382,14 +394,14 @@
     # Groq uses OpenAI-compatible API
     messages = [
         {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt}
]
|
| 387 |
|
| 388 |
response = client.chat.completions.create(
|
| 389 |
model=GROQ_MODEL,
|
| 390 |
messages=messages,
|
| 391 |
max_tokens=MAX_TOKENS,
|
| 392 |
-
temperature=TEMPERATURE
|
| 393 |
)
|
| 394 |
|
| 395 |
plan = response.choices[0].message.content
|
|
@@ -402,10 +414,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
|
|
| 402 |
# Unified Planning Function with Fallback Chain
|
| 403 |
# ============================================================================
|
| 404 |
|
|
|
|
| 405 |
def plan_question(
|
| 406 |
question: str,
|
| 407 |
available_tools: Dict[str, Dict],
|
| 408 |
-
file_paths: Optional[List[str]] = None
|
| 409 |
) -> str:
|
| 410 |
"""
|
| 411 |
Analyze question and generate execution plan using LLM.
|
|
@@ -427,36 +440,47 @@ def plan_question(
|
|
| 427 |
lambda: plan_question_gemini(question, available_tools, file_paths)
|
| 428 |
)
|
| 429 |
except Exception as gemini_error:
|
| 430 |
-
logger.warning(
|
|
|
|
|
|
|
| 431 |
try:
|
| 432 |
return retry_with_backoff(
|
| 433 |
lambda: plan_question_hf(question, available_tools, file_paths)
|
| 434 |
)
|
| 435 |
except Exception as hf_error:
|
| 436 |
-
logger.warning(
|
|
|
|
|
|
|
| 437 |
try:
|
| 438 |
return retry_with_backoff(
|
| 439 |
lambda: plan_question_groq(question, available_tools, file_paths)
|
| 440 |
)
|
| 441 |
except Exception as groq_error:
|
| 442 |
-
logger.warning(
|
|
|
|
|
|
|
| 443 |
try:
|
| 444 |
return retry_with_backoff(
|
| 445 |
-
lambda: plan_question_claude(
|
|
|
|
|
|
|
| 446 |
)
|
| 447 |
except Exception as claude_error:
|
| 448 |
-
logger.error(
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
|
| 451 |
|
| 452 |
# ============================================================================
|
| 453 |
# Tool Selection - Claude Implementation
|
| 454 |
# ============================================================================
|
| 455 |
|
|
|
|
| 456 |
def select_tools_claude(
|
| 457 |
-
question: str,
|
| 458 |
-
plan: str,
|
| 459 |
-
available_tools: Dict[str, Dict]
|
| 460 |
) -> List[Dict[str, Any]]:
|
| 461 |
"""Use Claude function calling to select tools and extract parameters."""
|
| 462 |
client = create_claude_client()
|
|
@@ -464,15 +488,17 @@ def select_tools_claude(
|
|
| 464 |
# Convert tool registry to Claude function calling format
|
| 465 |
tool_schemas = []
|
| 466 |
for name, info in available_tools.items():
|
| 467 |
-
tool_schemas.append(
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
"
|
| 472 |
-
|
| 473 |
-
|
|
|
|
|
|
|
| 474 |
}
|
| 475 |
-
|
| 476 |
|
| 477 |
system_prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.
|
| 478 |
|
|
@@ -491,7 +517,9 @@ Plan:
|
|
| 491 |
|
| 492 |
Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""
|
| 493 |
|
| 494 |
-
logger.info(
|
|
|
|
|
|
|
| 495 |
|
| 496 |
response = client.messages.create(
|
| 497 |
model=CLAUDE_MODEL,
|
|
@@ -499,18 +527,20 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 499 |
temperature=TEMPERATURE,
|
| 500 |
system=system_prompt,
|
| 501 |
messages=[{"role": "user", "content": user_prompt}],
|
| 502 |
-
tools=tool_schemas
|
| 503 |
)
|
| 504 |
|
| 505 |
# Extract tool calls from response
|
| 506 |
tool_calls = []
|
| 507 |
for content_block in response.content:
|
| 508 |
if content_block.type == "tool_use":
|
| 509 |
-
tool_calls.append(
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
|
|
|
|
|
|
| 514 |
|
| 515 |
logger.info(f"[select_tools_claude] Claude selected {len(tool_calls)} tool(s)")
|
| 516 |
|
|
@@ -521,10 +551,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 521 |
# Tool Selection - Gemini Implementation
|
| 522 |
# ============================================================================
|
| 523 |
|
|
|
|
| 524 |
def select_tools_gemini(
|
| 525 |
-
question: str,
|
| 526 |
-
plan: str,
|
| 527 |
-
available_tools: Dict[str, Dict]
|
| 528 |
) -> List[Dict[str, Any]]:
|
| 529 |
"""Use Gemini function calling to select tools and extract parameters."""
|
| 530 |
model = create_gemini_client()
|
|
@@ -532,25 +561,29 @@ def select_tools_gemini(
|
|
| 532 |
# Convert tool registry to Gemini function calling format
|
| 533 |
tools = []
|
| 534 |
for name, info in available_tools.items():
|
| 535 |
-
tools.append(
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
)
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
)
|
| 554 |
|
| 555 |
prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.
|
| 556 |
|
|
@@ -569,27 +602,30 @@ Question: {question}
|
|
| 569 |
|
| 570 |
Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""
|
| 571 |
|
| 572 |
-
logger.info(
|
|
|
|
|
|
|
| 573 |
|
| 574 |
response = model.generate_content(
|
| 575 |
prompt,
|
| 576 |
tools=tools,
|
| 577 |
generation_config=genai.types.GenerationConfig(
|
| 578 |
-
temperature=TEMPERATURE,
|
| 579 |
-
|
| 580 |
-
)
|
| 581 |
)
|
| 582 |
|
| 583 |
# Extract tool calls from response
|
| 584 |
tool_calls = []
|
| 585 |
for part in response.parts:
|
| 586 |
-
if hasattr(part,
|
| 587 |
fc = part.function_call
|
| 588 |
-
tool_calls.append(
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
| 593 |
|
| 594 |
logger.info(f"[select_tools_gemini] Gemini selected {len(tool_calls)} tool(s)")
|
| 595 |
|
|
@@ -600,10 +636,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 600 |
# Tool Selection - HuggingFace Implementation
|
| 601 |
# ============================================================================
|
| 602 |
|
|
|
|
| 603 |
def select_tools_hf(
|
| 604 |
-
question: str,
|
| 605 |
-
plan: str,
|
| 606 |
-
available_tools: Dict[str, Dict]
|
| 607 |
) -> List[Dict[str, Any]]:
|
| 608 |
"""Use HuggingFace Inference API with function calling to select tools and extract parameters."""
|
| 609 |
client = create_hf_client()
|
|
@@ -619,16 +654,16 @@ def select_tools_hf(
|
|
| 619 |
"parameters": {
|
| 620 |
"type": "object",
|
| 621 |
"properties": {},
|
| 622 |
-
"required": info.get("required_params", [])
|
| 623 |
-
}
|
| 624 |
-
}
|
| 625 |
}
|
| 626 |
|
| 627 |
# Add parameter schemas
|
| 628 |
for param_name, param_info in info.get("parameters", {}).items():
|
| 629 |
tool_schema["function"]["parameters"]["properties"][param_name] = {
|
| 630 |
"type": param_info.get("type", "string"),
|
| 631 |
-
"description": param_info.get("description", "")
|
| 632 |
}
|
| 633 |
|
| 634 |
tools.append(tool_schema)
|
|
@@ -650,31 +685,36 @@ Plan:
|
|
| 650 |
|
| 651 |
Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""
|
| 652 |
|
| 653 |
-
logger.info(
|
|
|
|
|
|
|
| 654 |
|
| 655 |
messages = [
|
| 656 |
{"role": "system", "content": system_prompt},
|
| 657 |
-
{"role": "user", "content": user_prompt}
|
| 658 |
]
|
| 659 |
|
| 660 |
# HuggingFace Inference API with tools parameter
|
| 661 |
response = client.chat_completion(
|
| 662 |
-
messages=messages,
|
| 663 |
-
tools=tools,
|
| 664 |
-
max_tokens=MAX_TOKENS,
|
| 665 |
-
temperature=TEMPERATURE
|
| 666 |
)
|
| 667 |
|
| 668 |
# Extract tool calls from response
|
| 669 |
tool_calls = []
|
| 670 |
-
if
|
|
|
|
|
|
|
|
|
|
| 671 |
for tool_call in response.choices[0].message.tool_calls:
|
| 672 |
import json
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
|
|
|
|
|
|
|
|
|
| 678 |
|
| 679 |
logger.info(f"[select_tools_hf] HuggingFace selected {len(tool_calls)} tool(s)")
|
| 680 |
|
|
@@ -685,10 +725,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 685 |
# Tool Selection - Groq Implementation
|
| 686 |
# ============================================================================
|
| 687 |
|
|
|
|
| 688 |
def select_tools_groq(
|
| 689 |
-
question: str,
|
| 690 |
-
plan: str,
|
| 691 |
-
available_tools: Dict[str, Dict]
|
| 692 |
) -> List[Dict[str, Any]]:
|
| 693 |
"""Use Groq with function calling to select tools and extract parameters."""
|
| 694 |
client = create_groq_client()
|
|
@@ -704,16 +743,16 @@ def select_tools_groq(
|
|
| 704 |
"parameters": {
|
| 705 |
"type": "object",
|
| 706 |
"properties": {},
|
| 707 |
-
"required": info.get("required_params", [])
|
| 708 |
-
}
|
| 709 |
-
}
|
| 710 |
}
|
| 711 |
|
| 712 |
# Add parameter schemas
|
| 713 |
for param_name, param_info in info.get("parameters", {}).items():
|
| 714 |
tool_schema["function"]["parameters"]["properties"][param_name] = {
|
| 715 |
"type": param_info.get("type", "string"),
|
| 716 |
-
"description": param_info.get("description", "")
|
| 717 |
}
|
| 718 |
|
| 719 |
tools.append(tool_schema)
|
|
@@ -735,11 +774,13 @@ Plan:
|
|
| 735 |
|
| 736 |
Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""
|
| 737 |
|
| 738 |
-
logger.info(
|
|
|
|
|
|
|
| 739 |
|
| 740 |
messages = [
|
| 741 |
{"role": "system", "content": system_prompt},
|
| 742 |
-
{"role": "user", "content": user_prompt}
|
| 743 |
]
|
| 744 |
|
| 745 |
# Groq function calling
|
|
@@ -748,19 +789,25 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 748 |
messages=messages,
|
| 749 |
tools=tools,
|
| 750 |
max_tokens=MAX_TOKENS,
|
| 751 |
-
temperature=TEMPERATURE
|
| 752 |
)
|
| 753 |
|
| 754 |
# Extract tool calls from response
|
| 755 |
tool_calls = []
|
| 756 |
-
if
|
|
|
|
|
|
|
|
|
|
| 757 |
for tool_call in response.choices[0].message.tool_calls:
|
| 758 |
import json
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
logger.info(f"[select_tools_groq] Groq selected {len(tool_calls)} tool(s)")
|
| 766 |
|
|
@@ -771,10 +818,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
|
|
| 771 |
# Unified Tool Selection with Fallback Chain
|
| 772 |
# ============================================================================
|
| 773 |
|
|
|
|
| 774 |
def select_tools_with_function_calling(
|
| 775 |
-
question: str,
|
| 776 |
-
plan: str,
|
| 777 |
-
available_tools: Dict[str, Dict]
|
| 778 |
) -> List[Dict[str, Any]]:
|
| 779 |
"""
|
| 780 |
Use LLM function calling to dynamically select tools and extract parameters.
|
|
@@ -796,41 +842,51 @@ def select_tools_with_function_calling(
|
|
| 796 |
lambda: select_tools_gemini(question, plan, available_tools)
|
| 797 |
)
|
| 798 |
except Exception as gemini_error:
|
| 799 |
-
logger.warning(
|
|
|
|
|
|
|
| 800 |
try:
|
| 801 |
return retry_with_backoff(
|
| 802 |
lambda: select_tools_hf(question, plan, available_tools)
|
| 803 |
)
|
| 804 |
except Exception as hf_error:
|
| 805 |
-
logger.warning(
|
|
|
|
|
|
|
| 806 |
try:
|
| 807 |
return retry_with_backoff(
|
| 808 |
lambda: select_tools_groq(question, plan, available_tools)
|
| 809 |
)
|
| 810 |
except Exception as groq_error:
|
| 811 |
-
logger.warning(
|
|
|
|
|
|
|
| 812 |
try:
|
| 813 |
return retry_with_backoff(
|
| 814 |
lambda: select_tools_claude(question, plan, available_tools)
|
| 815 |
)
|
| 816 |
except Exception as claude_error:
|
| 817 |
-
logger.error(
|
| 818 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
|
| 820 |
|
| 821 |
# ============================================================================
|
| 822 |
# Answer Synthesis - Claude Implementation
|
| 823 |
# ============================================================================
|
| 824 |
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
evidence: List[str]
|
| 828 |
-
) -> str:
|
| 829 |
"""Synthesize factoid answer from evidence using Claude."""
|
| 830 |
client = create_claude_client()
|
| 831 |
|
| 832 |
# Format evidence
|
| 833 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 834 |
|
| 835 |
system_prompt = """You are an answer synthesis agent for the GAIA benchmark.
|
| 836 |
|
|
@@ -867,7 +923,7 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 867 |
max_tokens=256, # Factoid answers are short
|
| 868 |
temperature=TEMPERATURE,
|
| 869 |
system=system_prompt,
|
| 870 |
-
messages=[{"role": "user", "content": user_prompt}]
|
| 871 |
)
|
| 872 |
|
| 873 |
answer = response.content[0].text.strip()
|
|
@@ -880,15 +936,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 880 |
# Answer Synthesis - Gemini Implementation
|
| 881 |
# ============================================================================
|
| 882 |
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
evidence: List[str]
|
| 886 |
-
) -> str:
|
| 887 |
"""Synthesize factoid answer from evidence using Gemini."""
|
| 888 |
model = create_gemini_client()
|
| 889 |
|
| 890 |
# Format evidence
|
| 891 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 892 |
|
| 893 |
prompt = f"""You are an answer synthesis agent for the GAIA benchmark.
|
| 894 |
|
|
@@ -923,8 +979,8 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 923 |
prompt,
|
| 924 |
generation_config=genai.types.GenerationConfig(
|
| 925 |
temperature=TEMPERATURE,
|
| 926 |
-
max_output_tokens=256 # Factoid answers are short
|
| 927 |
-
)
|
| 928 |
)
|
| 929 |
|
| 930 |
answer = response.text.strip()
|
|
@@ -937,15 +993,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 937 |
# Answer Synthesis - HuggingFace Implementation
|
| 938 |
# ============================================================================
|
| 939 |
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
evidence: List[str]
|
| 943 |
-
) -> str:
|
| 944 |
"""Synthesize factoid answer from evidence using HuggingFace Inference API."""
|
| 945 |
client = create_hf_client()
|
| 946 |
|
| 947 |
# Format evidence
|
| 948 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 949 |
|
| 950 |
system_prompt = """You are an answer synthesis agent for the GAIA benchmark.
|
| 951 |
|
|
@@ -979,13 +1035,13 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 979 |
|
| 980 |
messages = [
|
| 981 |
{"role": "system", "content": system_prompt},
|
| 982 |
-
{"role": "user", "content": user_prompt}
|
| 983 |
]
|
| 984 |
|
| 985 |
response = client.chat_completion(
|
| 986 |
messages=messages,
|
| 987 |
max_tokens=256, # Factoid answers are short
|
| 988 |
-
temperature=TEMPERATURE
|
| 989 |
)
|
| 990 |
|
| 991 |
answer = response.choices[0].message.content.strip()
|
|
@@ -998,15 +1054,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 998 |
# Answer Synthesis - Groq Implementation
|
| 999 |
# ============================================================================
|
| 1000 |
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
evidence: List[str]
|
| 1004 |
-
) -> str:
|
| 1005 |
"""Synthesize factoid answer from evidence using Groq."""
|
| 1006 |
client = create_groq_client()
|
| 1007 |
|
| 1008 |
# Format evidence
|
| 1009 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 1010 |
|
| 1011 |
system_prompt = """You are an answer synthesis agent for the GAIA benchmark.
|
| 1012 |
|
|
@@ -1040,14 +1096,14 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 1040 |
|
| 1041 |
messages = [
|
| 1042 |
{"role": "system", "content": system_prompt},
|
| 1043 |
-
{"role": "user", "content": user_prompt}
|
| 1044 |
]
|
| 1045 |
|
| 1046 |
response = client.chat.completions.create(
|
| 1047 |
model=GROQ_MODEL,
|
| 1048 |
messages=messages,
|
| 1049 |
max_tokens=256, # Factoid answers are short
|
| 1050 |
-
temperature=TEMPERATURE
|
| 1051 |
)
|
| 1052 |
|
| 1053 |
answer = response.choices[0].message.content.strip()
|
|
@@ -1060,10 +1116,8 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
|
|
| 1060 |
# Unified Answer Synthesis with Fallback Chain
|
| 1061 |
# ============================================================================
|
| 1062 |
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
evidence: List[str]
|
| 1066 |
-
) -> str:
|
| 1067 |
"""
|
| 1068 |
Synthesize factoid answer from collected evidence using LLM.
|
| 1069 |
|
|
@@ -1079,36 +1133,43 @@ def synthesize_answer(
|
|
| 1079 |
Factoid answer string
|
| 1080 |
"""
|
| 1081 |
try:
|
| 1082 |
-
return retry_with_backoff(
|
| 1083 |
-
lambda: synthesize_answer_gemini(question, evidence)
|
| 1084 |
-
)
|
| 1085 |
except Exception as gemini_error:
|
| 1086 |
-
logger.warning(
|
|
|
|
|
|
|
| 1087 |
try:
|
| 1088 |
-
return retry_with_backoff(
|
| 1089 |
-
lambda: synthesize_answer_hf(question, evidence)
|
| 1090 |
-
)
|
| 1091 |
except Exception as hf_error:
|
| 1092 |
-
logger.warning(
|
|
|
|
|
|
|
| 1093 |
try:
|
| 1094 |
return retry_with_backoff(
|
| 1095 |
lambda: synthesize_answer_groq(question, evidence)
|
| 1096 |
)
|
| 1097 |
except Exception as groq_error:
|
| 1098 |
-
logger.warning(
|
|
|
|
|
|
|
| 1099 |
try:
|
| 1100 |
return retry_with_backoff(
|
| 1101 |
lambda: synthesize_answer_claude(question, evidence)
|
| 1102 |
)
|
| 1103 |
except Exception as claude_error:
|
| 1104 |
-
logger.error(
|
| 1105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1106 |
|
| 1107 |
|
| 1108 |
# ============================================================================
|
| 1109 |
# Conflict Resolution Functions
|
| 1110 |
# ============================================================================
|
| 1111 |
|
|
|
|
| 1112 |
def resolve_conflicts(evidence: List[str]) -> Dict[str, Any]:
|
| 1113 |
"""
|
| 1114 |
Detect and resolve conflicts in evidence using LLM reasoning.
|
|
@@ -1126,7 +1187,9 @@ def resolve_conflicts(evidence: List[str]) -> Dict[str, Any]:
|
|
| 1126 |
try:
|
| 1127 |
# Try Gemini first
|
| 1128 |
model = create_gemini_client()
|
| 1129 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 1130 |
|
| 1131 |
prompt = f"""You are a conflict detection agent.
|
| 1132 |
|
|
@@ -1153,20 +1216,20 @@ Respond in JSON format:
|
|
| 1153 |
|
| 1154 |
response = model.generate_content(prompt)
|
| 1155 |
|
| 1156 |
-
result = {
|
| 1157 |
-
"has_conflicts": False,
|
| 1158 |
-
"conflicts": [],
|
| 1159 |
-
"resolution": response.text
|
| 1160 |
-
}
|
| 1161 |
|
| 1162 |
return result
|
| 1163 |
|
| 1164 |
except Exception as gemini_error:
|
| 1165 |
-
logger.warning(
|
|
|
|
|
|
|
| 1166 |
|
| 1167 |
# Fallback to Claude
|
| 1168 |
client = create_claude_client()
|
| 1169 |
-
evidence_text = "\n\n".join(
|
|
|
|
|
|
|
| 1170 |
|
| 1171 |
system_prompt = """You are a conflict detection agent.
|
| 1172 |
|
|
@@ -1194,13 +1257,13 @@ Respond in JSON format:
|
|
| 1194 |
max_tokens=MAX_TOKENS,
|
| 1195 |
temperature=TEMPERATURE,
|
| 1196 |
system=system_prompt,
|
| 1197 |
-
messages=[{"role": "user", "content": user_prompt}]
|
| 1198 |
)
|
| 1199 |
|
| 1200 |
result = {
|
| 1201 |
"has_conflicts": False,
|
| 1202 |
"conflicts": [],
|
| 1203 |
-
"resolution": response.content[0].text
|
| 1204 |
}
|
| 1205 |
|
| 1206 |
return result
|
|
|
|
@@ -38,7 +38,7 @@ HF_MODEL = "Qwen/Qwen2.5-72B-Instruct" # Excellent for function calling and rea
# Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"

# Groq Configuration
+GROQ_MODEL = "qwen/qwen3-32b"  # Free tier: 60 req/min, fast inference
# Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"

# Shared Configuration
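The new GROQ_MODEL value can be sanity-checked with a one-off call before running the full agent. A minimal sketch, assuming GROQ_API_KEY is set in the environment; ping_groq is a hypothetical helper, not part of llm_client.py:

import os
from groq import Groq

def ping_groq(model: str = "qwen/qwen3-32b") -> str:
    # One-shot request against the configured model; raises if the model id is invalid.
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Reply with the single word OK."}],
        max_tokens=8,
    )
    return response.choices[0].message.content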
@@ -54,6 +54,7 @@ logger = logging.getLogger(__name__)
# Retry Logic with Exponential Backoff
# ============================================================================

+
def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:
    """
    Retry function with exponential backoff on quota errors.

@@ -81,15 +82,15 @@ def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:

            # Check if this is a quota/rate limit error
            is_quota_error = (
+                "429" in str(e)
+                or "quota" in error_str
+                or "rate limit" in error_str
+                or "too many requests" in error_str
            )

            if is_quota_error and attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s
+                wait_time = 2**attempt
                logger.warning(
                    f"Quota/rate limit error (attempt {attempt + 1}/{max_retries}): {e}. "
                    f"Retrying in {wait_time}s..."

@@ -100,10 +101,12 @@ def retry_with_backoff(func: Callable, max_retries: int = 3) -> Any:
            # If not a quota error, or last attempt, raise immediately
            raise

+
# ============================================================================
# Client Initialization
# ============================================================================

+
def create_claude_client() -> Anthropic:
    """Initialize Anthropic client with API key from environment."""
    api_key = os.getenv("ANTHROPIC_API_KEY")
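retry_with_backoff wraps a zero-argument callable and, as the hunk above shows, retries only on quota-style failures ("429", "quota", "rate limit", "too many requests"), sleeping 2**attempt seconds between tries; any other exception propagates immediately. A usage sketch with a hypothetical failing function:

def flaky_call() -> str:
    # Simulated provider response that trips the quota check above.
    raise Exception("429 Too Many Requests")

try:
    result = retry_with_backoff(lambda: flaky_call(), max_retries=3)
except Exception as exc:
    # Reaches here after waits of 1s and 2s (2**0, 2**1) between the three attempts.
    print(f"gave up: {exc}")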
@@ -149,10 +152,11 @@ def create_groq_client() -> Groq:
# Planning Functions - Claude Implementation
# ============================================================================

+
def plan_question_claude(
    question: str,
    available_tools: Dict[str, Dict],
+    file_paths: Optional[List[str]] = None,
) -> str:
    """Analyze question and generate execution plan using Claude."""
    client = create_claude_client()

@@ -168,7 +172,9 @@ def plan_question_claude(
    # File context
    file_context = ""
    if file_paths:
+        file_context = f"\n\nAvailable files:\n" + "\n".join(
+            [f"- {fp}" for fp in file_paths]
+        )

    # Prompt for planning
    system_prompt = """You are a planning agent for answering complex questions.

@@ -197,7 +203,7 @@ Create an execution plan to answer this question. Format as numbered steps."""
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        system=system_prompt,
+        messages=[{"role": "user", "content": user_prompt}],
    )

    plan = response.content[0].text
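For reference, the file-context block above builds a plain bulleted string. A standalone sketch with hypothetical paths:

file_paths = ["data/albums.xlsx", "images/chess_board.png"]  # hypothetical inputs
file_context = "\n\nAvailable files:\n" + "\n".join([f"- {fp}" for fp in file_paths])
# file_context == "\n\nAvailable files:\n- data/albums.xlsx\n- images/chess_board.png"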
@@ -210,10 +216,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
# Planning Functions - Gemini Implementation
# ============================================================================

+
def plan_question_gemini(
    question: str,
    available_tools: Dict[str, Dict],
+    file_paths: Optional[List[str]] = None,
) -> str:
    """Analyze question and generate execution plan using Gemini."""
    model = create_gemini_client()

@@ -229,7 +236,9 @@ def plan_question_gemini(
    # File context
    file_context = ""
    if file_paths:
+        file_context = f"\n\nAvailable files:\n" + "\n".join(
+            [f"- {fp}" for fp in file_paths]
+        )

    # Combined prompt (Gemini doesn't use separate system/user like Claude)
    prompt = f"""You are a planning agent for answering complex questions.

@@ -256,9 +265,8 @@ Create an execution plan to answer this question. Format as numbered steps."""
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
+            temperature=TEMPERATURE, max_output_tokens=MAX_TOKENS
+        ),
    )

    plan = response.text

@@ -271,10 +279,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
# Planning Functions - HuggingFace Implementation
# ============================================================================

+
def plan_question_hf(
    question: str,
    available_tools: Dict[str, Dict],
+    file_paths: Optional[List[str]] = None,
) -> str:
    """Analyze question and generate execution plan using HuggingFace Inference API."""
    client = create_hf_client()

@@ -290,7 +299,9 @@ def plan_question_hf(
    # File context
    file_context = ""
    if file_paths:
+        file_context = f"\n\nAvailable files:\n" + "\n".join(
+            [f"- {fp}" for fp in file_paths]
+        )

    # System message for Qwen 2.5 (supports system/user format)
    system_prompt = """You are a planning agent for answering complex questions.

@@ -317,13 +328,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
    # HuggingFace Inference API chat completion
    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    response = client.chat_completion(
+        messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE
    )

    plan = response.choices[0].message.content

@@ -336,10 +345,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
# Planning Functions - Groq Implementation
# ============================================================================

+
def plan_question_groq(
    question: str,
    available_tools: Dict[str, Dict],
+    file_paths: Optional[List[str]] = None,
) -> str:
    """Analyze question and generate execution plan using Groq."""
    client = create_groq_client()

@@ -355,7 +365,9 @@ def plan_question_groq(
    # File context
    file_context = ""
    if file_paths:
+        file_context = f"\n\nAvailable files:\n" + "\n".join(
+            [f"- {fp}" for fp in file_paths]
+        )

    # System message for Llama 3.1 (supports system/user format)
    system_prompt = """You are a planning agent for answering complex questions.

@@ -382,14 +394,14 @@ Create an execution plan to answer this question. Format as numbered steps."""
    # Groq uses OpenAI-compatible API
    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=messages,
        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
    )

    plan = response.choices[0].message.content
@@ -402,10 +414,11 @@ Create an execution plan to answer this question. Format as numbered steps."""
# Unified Planning Function with Fallback Chain
# ============================================================================

+
def plan_question(
    question: str,
    available_tools: Dict[str, Dict],
+    file_paths: Optional[List[str]] = None,
) -> str:
    """
    Analyze question and generate execution plan using LLM.

@@ -427,36 +440,47 @@ def plan_question(
            lambda: plan_question_gemini(question, available_tools, file_paths)
        )
    except Exception as gemini_error:
+        logger.warning(
+            f"[plan_question] Gemini failed: {gemini_error}, trying HuggingFace fallback"
+        )
        try:
            return retry_with_backoff(
                lambda: plan_question_hf(question, available_tools, file_paths)
            )
        except Exception as hf_error:
+            logger.warning(
+                f"[plan_question] HuggingFace failed: {hf_error}, trying Groq fallback"
+            )
            try:
                return retry_with_backoff(
                    lambda: plan_question_groq(question, available_tools, file_paths)
                )
            except Exception as groq_error:
+                logger.warning(
+                    f"[plan_question] Groq failed: {groq_error}, trying Claude fallback"
+                )
                try:
                    return retry_with_backoff(
+                        lambda: plan_question_claude(
+                            question, available_tools, file_paths
+                        )
                    )
                except Exception as claude_error:
+                    logger.error(
+                        f"[plan_question] All LLMs failed. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )
+                    raise Exception(
+                        f"Planning failed with all LLMs. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )

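The nested try/except ladder above (Gemini, then HuggingFace, then Groq, then Claude) reappears almost verbatim for tool selection and answer synthesis below. A sketch of how the same behavior could be expressed once — a hypothetical refactor, not something this commit does:

from typing import Any, Callable, List, Tuple

def first_working(providers: List[Tuple[str, Callable[[], Any]]]) -> Any:
    # Try each provider in order, reusing retry_with_backoff per provider;
    # collect every failure so the final error mirrors the messages above.
    errors = []
    for name, call in providers:
        try:
            return retry_with_backoff(call)
        except Exception as exc:
            errors.append(f"{name}: {exc}")
            logger.warning(f"[fallback] {name} failed: {exc}")
    raise Exception("All providers failed. " + "; ".join(errors))

plan = first_working([
    ("Gemini", lambda: plan_question_gemini(question, available_tools, file_paths)),
    ("HF", lambda: plan_question_hf(question, available_tools, file_paths)),
    ("Groq", lambda: plan_question_groq(question, available_tools, file_paths)),
    ("Claude", lambda: plan_question_claude(question, available_tools, file_paths)),
])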
# ============================================================================
# Tool Selection - Claude Implementation
# ============================================================================

+
def select_tools_claude(
+    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use Claude function calling to select tools and extract parameters."""
    client = create_claude_client()

@@ -464,15 +488,17 @@ def select_tools_claude(
    # Convert tool registry to Claude function calling format
    tool_schemas = []
    for name, info in available_tools.items():
+        tool_schemas.append(
+            {
+                "name": name,
+                "description": info["description"],
+                "input_schema": {
+                    "type": "object",
+                    "properties": info.get("parameters", {}),
+                    "required": info.get("required_params", []),
+                },
            }
+        )

    system_prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

@@ -491,7 +517,9 @@ Plan:

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

+    logger.info(
+        f"[select_tools_claude] Calling Claude with function calling for {len(tool_schemas)} tools"
+    )

    response = client.messages.create(
        model=CLAUDE_MODEL,

@@ -499,18 +527,20 @@ Select and call the tools needed according to the plan. Use exact parameter name
        temperature=TEMPERATURE,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
+        tools=tool_schemas,
    )

    # Extract tool calls from response
    tool_calls = []
    for content_block in response.content:
        if content_block.type == "tool_use":
+            tool_calls.append(
+                {
+                    "tool": content_block.name,
+                    "params": content_block.input,
+                    "id": content_block.id,
+                }
+            )

    logger.info(f"[select_tools_claude] Claude selected {len(tool_calls)} tool(s)")
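The conversion loop above assumes each available_tools entry carries a description, a parameters mapping, and a required_params list. A sketch of one such registry entry (hypothetical — the real registry lives outside this module) and the Claude schema it becomes:

available_tools = {
    "web_search": {
        "description": "Search the web and return top snippets",
        "parameters": {"query": {"type": "string", "description": "Search query"}},
        "required_params": ["query"],
    }
}
# The loop above would emit:
# {
#     "name": "web_search",
#     "description": "Search the web and return top snippets",
#     "input_schema": {
#         "type": "object",
#         "properties": {"query": {"type": "string", "description": "Search query"}},
#         "required": ["query"],
#     },
# }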
@@ -521,10 +551,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
# Tool Selection - Gemini Implementation
# ============================================================================

+
def select_tools_gemini(
+    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use Gemini function calling to select tools and extract parameters."""
    model = create_gemini_client()

@@ -532,25 +561,29 @@ def select_tools_gemini(
    # Convert tool registry to Gemini function calling format
    tools = []
    for name, info in available_tools.items():
+        tools.append(
+            genai.protos.Tool(
+                function_declarations=[
+                    genai.protos.FunctionDeclaration(
+                        name=name,
+                        description=info["description"],
+                        parameters=genai.protos.Schema(
+                            type=genai.protos.Type.OBJECT,
+                            properties={
+                                param_name: genai.protos.Schema(
+                                    type=genai.protos.Type.STRING,
+                                    description=param_info.get("description", ""),
+                                )
+                                for param_name, param_info in info.get(
+                                    "parameters", {}
+                                ).items()
+                            },
+                            required=info.get("required_params", []),
+                        ),
                    )
+                ]
+            )
+        )

    prompt = f"""You are a tool selection expert. Based on the question and execution plan, select appropriate tools with correct parameters.

@@ -569,27 +602,30 @@ Question: {question}

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

+    logger.info(
+        f"[select_tools_gemini] Calling Gemini with function calling for {len(available_tools)} tools"
+    )

    response = model.generate_content(
        prompt,
        tools=tools,
        generation_config=genai.types.GenerationConfig(
+            temperature=TEMPERATURE, max_output_tokens=MAX_TOKENS
+        ),
    )

    # Extract tool calls from response
    tool_calls = []
    for part in response.parts:
+        if hasattr(part, "function_call") and part.function_call:
            fc = part.function_call
+            tool_calls.append(
+                {
+                    "tool": fc.name,
+                    "params": dict(fc.args),
+                    "id": f"gemini_{len(tool_calls)}",
+                }
+            )

    logger.info(f"[select_tools_gemini] Gemini selected {len(tool_calls)} tool(s)")
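Two details of the Gemini branch are worth noting. Gemini returns function_call.args as a protobuf map rather than a plain dict, which is why the extraction loop wraps it in dict(...) — a shallow conversion; deeply nested values may need further coercion. And because every parameter is declared as Type.STRING in the schema above, numeric tool arguments come back as strings. A small sketch of the downstream consequence (max_results is a hypothetical parameter):

params = dict(fc.args)  # shallow conversion of the proto map to a plain dict
count = int(params.get("max_results", "5"))  # string-typed by the schema, cast before use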
@@ -600,10 +636,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
# Tool Selection - HuggingFace Implementation
# ============================================================================

+
def select_tools_hf(
+    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use HuggingFace Inference API with function calling to select tools and extract parameters."""
    client = create_hf_client()

@@ -619,16 +654,16 @@ def select_tools_hf(
            "parameters": {
                "type": "object",
                "properties": {},
+                "required": info.get("required_params", []),
+            },
+        },
    }

    # Add parameter schemas
    for param_name, param_info in info.get("parameters", {}).items():
        tool_schema["function"]["parameters"]["properties"][param_name] = {
            "type": param_info.get("type", "string"),
+            "description": param_info.get("description", ""),
        }

    tools.append(tool_schema)

@@ -650,31 +685,36 @@ Plan:

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

+    logger.info(
+        f"[select_tools_hf] Calling HuggingFace with function calling for {len(tools)} tools"
+    )

    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    # HuggingFace Inference API with tools parameter
    response = client.chat_completion(
+        messages=messages, tools=tools, max_tokens=MAX_TOKENS, temperature=TEMPERATURE
    )

    # Extract tool calls from response
    tool_calls = []
+    if (
+        hasattr(response.choices[0].message, "tool_calls")
+        and response.choices[0].message.tool_calls
+    ):
        for tool_call in response.choices[0].message.tool_calls:
            import json
+
+            tool_calls.append(
+                {
+                    "tool": tool_call.function.name,
+                    "params": json.loads(tool_call.function.arguments),
+                    "id": tool_call.id,
+                }
+            )

    logger.info(f"[select_tools_hf] HuggingFace selected {len(tool_calls)} tool(s)")
@@ -685,10 +725,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
# Tool Selection - Groq Implementation
# ============================================================================

+
def select_tools_groq(
+    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """Use Groq with function calling to select tools and extract parameters."""
    client = create_groq_client()

@@ -704,16 +743,16 @@ def select_tools_groq(
            "parameters": {
                "type": "object",
                "properties": {},
+                "required": info.get("required_params", []),
+            },
+        },
    }

    # Add parameter schemas
    for param_name, param_info in info.get("parameters", {}).items():
        tool_schema["function"]["parameters"]["properties"][param_name] = {
            "type": param_info.get("type", "string"),
+            "description": param_info.get("description", ""),
        }

    tools.append(tool_schema)

@@ -735,11 +774,13 @@ Plan:

Select and call the tools needed according to the plan. Use exact parameter names from tool schemas."""

+    logger.info(
+        f"[select_tools_groq] Calling Groq with function calling for {len(tools)} tools"
+    )

    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    # Groq function calling

@@ -748,19 +789,25 @@ Select and call the tools needed according to the plan. Use exact parameter name
        messages=messages,
        tools=tools,
        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
    )

    # Extract tool calls from response
    tool_calls = []
+    if (
+        hasattr(response.choices[0].message, "tool_calls")
+        and response.choices[0].message.tool_calls
+    ):
        for tool_call in response.choices[0].message.tool_calls:
            import json
+
+            tool_calls.append(
+                {
+                    "tool": tool_call.function.name,
+                    "params": json.loads(tool_call.function.arguments),
+                    "id": tool_call.id,
+                }
+            )

    logger.info(f"[select_tools_groq] Groq selected {len(tool_calls)} tool(s)")
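Both the HuggingFace and Groq branches read OpenAI-style tool calls, in which the arguments field is a JSON string rather than a dict — hence the json.loads above. A standalone sketch with a hypothetical payload; note json.loads raises json.JSONDecodeError on malformed model output, which neither branch currently guards against:

import json

raw_arguments = '{"query": "Mercedes Sosa studio albums 2000-2009"}'  # hypothetical
params = json.loads(raw_arguments)
assert params == {"query": "Mercedes Sosa studio albums 2000-2009"}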
@@ -771,10 +818,9 @@ Select and call the tools needed according to the plan. Use exact parameter name
# Unified Tool Selection with Fallback Chain
# ============================================================================

+
def select_tools_with_function_calling(
+    question: str, plan: str, available_tools: Dict[str, Dict]
) -> List[Dict[str, Any]]:
    """
    Use LLM function calling to dynamically select tools and extract parameters.

@@ -796,41 +842,51 @@ def select_tools_with_function_calling(
            lambda: select_tools_gemini(question, plan, available_tools)
        )
    except Exception as gemini_error:
+        logger.warning(
+            f"[select_tools] Gemini failed: {gemini_error}, trying HuggingFace fallback"
+        )
        try:
            return retry_with_backoff(
                lambda: select_tools_hf(question, plan, available_tools)
            )
        except Exception as hf_error:
+            logger.warning(
+                f"[select_tools] HuggingFace failed: {hf_error}, trying Groq fallback"
+            )
            try:
                return retry_with_backoff(
                    lambda: select_tools_groq(question, plan, available_tools)
                )
            except Exception as groq_error:
+                logger.warning(
+                    f"[select_tools] Groq failed: {groq_error}, trying Claude fallback"
+                )
                try:
                    return retry_with_backoff(
                        lambda: select_tools_claude(question, plan, available_tools)
                    )
                except Exception as claude_error:
+                    logger.error(
+                        f"[select_tools] All LLMs failed. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )
+                    raise Exception(
+                        f"Tool selection failed with all LLMs. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )

# ============================================================================
# Answer Synthesis - Claude Implementation
# ============================================================================

+
+def synthesize_answer_claude(question: str, evidence: List[str]) -> str:
    """Synthesize factoid answer from evidence using Claude."""
    client = create_claude_client()

    # Format evidence
+    evidence_text = "\n\n".join(
+        [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

@@ -867,7 +923,7 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
        max_tokens=256,  # Factoid answers are short
        temperature=TEMPERATURE,
        system=system_prompt,
+        messages=[{"role": "user", "content": user_prompt}],
    )

    answer = response.content[0].text.strip()
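The evidence join above produces numbered blocks separated by blank lines. A standalone sketch with hypothetical evidence strings:

evidence = ["First source says 6 albums.", "Second source says 7 albums."]
evidence_text = "\n\n".join([f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)])
# Evidence 1:
# First source says 6 albums.
#
# Evidence 2:
# Second source says 7 albums.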
@@ -880,15 +936,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
# ============================================================================
# Answer Synthesis - Gemini Implementation
# ============================================================================

+
+def synthesize_answer_gemini(question: str, evidence: List[str]) -> str:
    """Synthesize factoid answer from evidence using Gemini."""
    model = create_gemini_client()

    # Format evidence
+    evidence_text = "\n\n".join(
+        [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+    )

    prompt = f"""You are an answer synthesis agent for the GAIA benchmark.

@@ -923,8 +979,8 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
        prompt,
        generation_config=genai.types.GenerationConfig(
            temperature=TEMPERATURE,
+            max_output_tokens=256,  # Factoid answers are short
+        ),
    )

    answer = response.text.strip()

@@ -937,15 +993,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
# ============================================================================
# Answer Synthesis - HuggingFace Implementation
# ============================================================================

+
+def synthesize_answer_hf(question: str, evidence: List[str]) -> str:
    """Synthesize factoid answer from evidence using HuggingFace Inference API."""
    client = create_hf_client()

    # Format evidence
+    evidence_text = "\n\n".join(
+        [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

@@ -979,13 +1035,13 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    response = client.chat_completion(
        messages=messages,
        max_tokens=256,  # Factoid answers are short
+        temperature=TEMPERATURE,
    )

    answer = response.choices[0].message.content.strip()

@@ -998,15 +1054,15 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
# ============================================================================
# Answer Synthesis - Groq Implementation
# ============================================================================

+
+def synthesize_answer_groq(question: str, evidence: List[str]) -> str:
    """Synthesize factoid answer from evidence using Groq."""
    client = create_groq_client()

    # Format evidence
+    evidence_text = "\n\n".join(
+        [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+    )

    system_prompt = """You are an answer synthesis agent for the GAIA benchmark.

@@ -1040,14 +1096,14 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
    messages = [
        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=messages,
        max_tokens=256,  # Factoid answers are short
+        temperature=TEMPERATURE,
    )

    answer = response.choices[0].message.content.strip()
@@ -1060,10 +1116,8 @@ Extract the factoid answer from the evidence above. Return only the factoid, not
# ============================================================================
# Unified Answer Synthesis with Fallback Chain
# ============================================================================

+
+def synthesize_answer(question: str, evidence: List[str]) -> str:
    """
    Synthesize factoid answer from collected evidence using LLM.

@@ -1079,36 +1133,43 @@ def synthesize_answer(
        Factoid answer string
    """
    try:
+        return retry_with_backoff(lambda: synthesize_answer_gemini(question, evidence))
    except Exception as gemini_error:
+        logger.warning(
+            f"[synthesize_answer] Gemini failed: {gemini_error}, trying HuggingFace fallback"
+        )
        try:
+            return retry_with_backoff(lambda: synthesize_answer_hf(question, evidence))
        except Exception as hf_error:
+            logger.warning(
+                f"[synthesize_answer] HuggingFace failed: {hf_error}, trying Groq fallback"
+            )
            try:
                return retry_with_backoff(
                    lambda: synthesize_answer_groq(question, evidence)
                )
            except Exception as groq_error:
+                logger.warning(
+                    f"[synthesize_answer] Groq failed: {groq_error}, trying Claude fallback"
+                )
                try:
                    return retry_with_backoff(
                        lambda: synthesize_answer_claude(question, evidence)
                    )
                except Exception as claude_error:
+                    logger.error(
+                        f"[synthesize_answer] All LLMs failed. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )
+                    raise Exception(
+                        f"Answer synthesis failed with all LLMs. Gemini: {gemini_error}, HF: {hf_error}, Groq: {groq_error}, Claude: {claude_error}"
+                    )

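Taken together, plan_question, select_tools_with_function_calling, and synthesize_answer are the module's three entry points. A sketch of how a caller composes them (run_tool is hypothetical — tool execution lives outside llm_client.py):

def answer_question(question, available_tools, file_paths=None) -> str:
    plan = plan_question(question, available_tools, file_paths)
    tool_calls = select_tools_with_function_calling(question, plan, available_tools)
    evidence = [str(run_tool(tc["tool"], tc["params"])) for tc in tool_calls]
    return synthesize_answer(question, evidence)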
# ============================================================================
# Conflict Resolution Functions
# ============================================================================

+
def resolve_conflicts(evidence: List[str]) -> Dict[str, Any]:
    """
    Detect and resolve conflicts in evidence using LLM reasoning.

@@ -1126,7 +1187,9 @@ def resolve_conflicts(evidence: List[str]) -> Dict[str, Any]:
    try:
        # Try Gemini first
        model = create_gemini_client()
+        evidence_text = "\n\n".join(
+            [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+        )

        prompt = f"""You are a conflict detection agent.

@@ -1153,20 +1216,20 @@ Respond in JSON format:

        response = model.generate_content(prompt)

+        result = {"has_conflicts": False, "conflicts": [], "resolution": response.text}

        return result

    except Exception as gemini_error:
+        logger.warning(
+            f"[resolve_conflicts] Gemini failed: {gemini_error}, trying Claude"
+        )

        # Fallback to Claude
        client = create_claude_client()
+        evidence_text = "\n\n".join(
+            [f"Evidence {i + 1}:\n{e}" for i, e in enumerate(evidence)]
+        )

        system_prompt = """You are a conflict detection agent.

@@ -1194,13 +1257,13 @@ Respond in JSON format:
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            system=system_prompt,
+            messages=[{"role": "user", "content": user_prompt}],
        )

        result = {
            "has_conflicts": False,
            "conflicts": [],
+            "resolution": response.content[0].text,
        }

        return result
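Note that both branches of resolve_conflicts ask the model for JSON but then hard-code has_conflicts=False and return the raw model text as "resolution". A hedged sketch of actually parsing the reply — this assumes the model honors the requested format, and is not current behavior:

import json

def parse_conflict_reply(text: str) -> dict:
    # Parse the JSON the prompt requests; fall back to the current
    # hard-coded shape when the model's output is not valid JSON.
    try:
        parsed = json.loads(text)
        return {
            "has_conflicts": bool(parsed.get("has_conflicts", False)),
            "conflicts": parsed.get("conflicts", []),
            "resolution": parsed.get("resolution", text),
        }
    except (json.JSONDecodeError, AttributeError):
        return {"has_conflicts": False, "conflicts": [], "resolution": text}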