Chris committed on
Commit
7ef24ef
·
1 Parent(s): 6dce4fa

Final 6.0.3

Browse files
src/app.py CHANGED
@@ -220,57 +220,38 @@ class GAIAResultLogger:
220
  return files[:10] # Return 10 most recent
221
 
222
  class GAIAAgentApp:
223
- """Production GAIA Agent Application with Unit 4 API integration"""
224
 
225
  def __init__(self, hf_token: Optional[str] = None):
226
- """Initialize the application with optional HF token"""
227
 
228
  # Priority order: 1) passed hf_token, 2) HF_TOKEN env var
229
  if not hf_token:
230
  hf_token = os.getenv("HF_TOKEN")
231
 
 
 
 
232
  try:
233
- # Try main QwenClient first
234
  from models.qwen_client import QwenClient
235
  self.llm_client = QwenClient(hf_token=hf_token)
236
- self.workflow = SimpleGAIAWorkflow(self.llm_client)
237
 
238
- # Test if client is working with a simple generation
239
- test_result = self.llm_client.generate("What is 2+2?", max_tokens=10)
240
- if not test_result.success or not test_result.response.strip():
241
- logger.error(f"❌ Main client test failed: {test_result}")
242
- raise Exception("Main client not working - no valid response generated")
243
 
244
  self.initialized = True
245
- logger.info("✅ GAIA Agent system initialized with main client")
246
 
247
  except Exception as e:
248
- logger.warning(f"⚠️ Main client failed ({e})")
249
-
250
- # Always try SimpleClient fallback when main models fail
251
- logger.warning("⚠️ Attempting SimpleClient fallback...")
252
- try:
253
- # Fallback to simple client
254
- from models.simple_client import SimpleClient
255
- self.llm_client = SimpleClient(hf_token=hf_token)
256
- self.workflow = SimpleGAIAWorkflow(self.llm_client)
257
-
258
- # Test simple client
259
- test_result = self.llm_client.generate("What is 2+2?", max_tokens=10)
260
- if test_result.success and test_result.response.strip():
261
- self.initialized = True
262
- logger.info("✅ GAIA Agent system initialized with SimpleClient fallback")
263
- else:
264
- logger.error("❌ SimpleClient also failed to generate responses")
265
- self.initialized = False
266
-
267
- except Exception as fallback_error:
268
- logger.error(f"❌ SimpleClient fallback also failed: {fallback_error}")
269
- self.initialized = False
270
 
271
  @classmethod
272
  def create_with_oauth_token(cls, oauth_token: str) -> "GAIAAgentApp":
273
  """Create a new instance with OAuth token"""
 
 
274
  return cls(hf_token=oauth_token)
275
 
276
  def __call__(self, question: str) -> str:
@@ -428,13 +409,14 @@ class GAIAAgentApp:
428
  return "\n".join(reasoning)
429
 
430
  def get_examples(self) -> list:
431
- """Get example questions for the interface"""
432
  return [
433
- "What is the capital of France?",
434
- "Calculate 25% of 200",
435
- "What is the square root of 144?",
436
- "What is the average of 10, 15, and 20?",
437
  "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
 
 
 
 
 
438
  ]
439
 
440
  def check_oauth_scopes(oauth_token: str) -> Dict[str, any]:
@@ -524,32 +506,35 @@ def format_auth_status(profile: gr.OAuthProfile | None) -> str:
524
  **🚀 FULL SYSTEM CAPABILITIES ENABLED**
525
 
526
  **Authentication Source**: HF_TOKEN environment variable
527
- **Scopes**: read, inference (full access)
 
528
 
529
  **Available Features:**
530
  - ✅ **Advanced Model Access**: Full Qwen model capabilities (7B/32B/72B)
531
  - ✅ **High Performance**: 30%+ expected GAIA score
532
- - ✅ **Complete Pipeline**: All agents and tools fully functional
533
- - ✅ **Web Research**: Full DuckDuckGo search capabilities
534
- - ✅ **File Processing**: Complete multi-format file handling
535
- - ✅ **Manual Testing**: Individual question processing
536
  - ✅ **Official Evaluation**: GAIA benchmark submission
537
 
538
- 💡 **Status**: Optimal configuration for GAIA benchmark performance.
539
  """
540
 
541
  if not profile:
542
  return """
543
  ### 🔐 Authentication Status: Not Logged In
544
 
545
- Please log in to access GAIA evaluation features with full inference access.
546
 
547
- **What you can do:**
548
- - Manual question testing (limited functionality)
549
- - Official GAIA benchmark evaluation (requires login)
 
550
 
551
- **🔑 OAuth Configuration**: Login now requests both `read` and `inference` scopes for optimal performance.
552
- **📈 Expected Performance**: 30%+ GAIA score with full inference access.
 
553
  """
554
 
555
  username = profile.username
@@ -568,7 +553,7 @@ Please log in to access GAIA evaluation features with full inference access.
568
  scopes = scope_info.get("scopes", [])
569
  status_parts.append(f"**Detected Scopes**: {', '.join(scopes) if scopes else 'None detected'}")
570
  status_parts.append("")
571
- status_parts.append("**Available Features:**")
572
 
573
  # Safely access capabilities
574
  can_inference = scope_info.get("can_inference", False)
@@ -576,46 +561,50 @@ Please log in to access GAIA evaluation features with full inference access.
576
 
577
  if can_inference:
578
  status_parts.extend([
579
- "- ✅ **Advanced Model Access**: Full Qwen model capabilities",
580
  "- ✅ **High Performance**: 30%+ expected GAIA score",
581
- "- ✅ **Complete Pipeline**: All agents and tools fully functional",
 
 
582
  "- ✅ **Inference Access**: Full model generation capabilities"
583
  ])
584
  else:
585
  status_parts.extend([
586
- "- ⚠️ **Limited Model Access**: Using fallback SimpleClient",
587
- "- ⚠️ **Basic Performance**: 15%+ expected GAIA score",
588
- "- **Reliable Responses**: Rule-based answers for common questions",
589
- "- ❌ **No Inference Access**: Limited to read-only operations"
590
  ])
591
 
592
  if can_read:
593
  status_parts.append("- ✅ **Profile Access**: Can read user information")
594
 
595
  status_parts.extend([
596
- "- ✅ **Manual Testing**: Individual question processing",
597
- "- ✅ **Official Evaluation**: GAIA benchmark submission"
598
  ])
599
 
600
  if not can_inference:
601
  status_parts.extend([
602
  "",
603
- "🔑 **Note**: Your OAuth session may have limited scopes.",
604
- "**Solution**: Try logging out and logging back in to request full inference access.",
605
- "**Alternative**: Set HF_TOKEN as a Space secret for guaranteed full access."
 
606
  ])
607
  else:
608
  status_parts.extend([
609
  "",
610
- "🎉 **Excellent**: You have full inference access for optimal performance!"
 
611
  ])
612
 
613
  return "\n".join(status_parts)
614
 
615
  def run_and_submit_all(profile: gr.OAuthProfile | None):
616
  """
617
- Fetches all questions from Unit 4 API, runs the GAIA Agent on them, submits all answers,
618
- and displays the results. Also returns updated authentication status and downloadable files.
619
  """
620
  start_time = time.time()
621
 
@@ -634,7 +623,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
634
  username = "unknown_user"
635
 
636
  if hf_token:
637
- logger.info("🎯 Using HF_TOKEN environment variable for authentication")
638
  oauth_token = hf_token
639
  username = "hf_token_user"
640
  elif profile:
@@ -649,8 +638,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
649
  test_response = requests.get("https://huggingface.co/api/whoami", headers=headers, timeout=5)
650
 
651
  if test_response.status_code == 401:
652
- logger.warning("⚠️ OAuth token has insufficient scopes for model inference")
653
- oauth_token = None # Force fallback to SimpleClient
654
  elif test_response.status_code == 200:
655
  logger.info("✅ OAuth token validated successfully")
656
  else:
@@ -659,27 +648,33 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
659
  except Exception as e:
660
  logger.warning(f"⚠️ Could not validate OAuth token: {e}")
661
  else:
662
- logger.info("User not logged in and no HF_TOKEN available.")
663
- return "Please either login to Hugging Face or set HF_TOKEN environment variable.", None, auth_status, None, None, None
664
 
665
  if not oauth_token:
666
- return "No valid authentication token available. Please login or set HF_TOKEN environment variable.", None, auth_status, None, None, None
667
 
668
  api_url = DEFAULT_API_URL
669
  questions_url = f"{api_url}/questions"
670
  submit_url = f"{api_url}/submit"
671
 
672
- # 1. Instantiate GAIA Agent with token
673
  try:
674
- logger.info("🚀 Creating GAIA Agent with authenticated token")
675
  agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
676
 
677
  if not agent.initialized:
678
- return "Error: GAIA Agent failed to initialize", None, auth_status, None, None, None
 
 
 
 
 
 
679
  except Exception as e:
680
- logger.error(f"Error instantiating agent: {e}")
681
- return f"Error initializing GAIA Agent: {e}", None, auth_status, None, None, None
682
-
683
  # Agent code URL
684
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Local Development"
685
  logger.info(f"Agent code URL: {agent_code}")
@@ -811,7 +806,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
811
  def create_interface():
812
  """Create the Gradio interface with both Unit 4 API and manual testing"""
813
 
814
- app = GAIAAgentApp()
 
815
 
816
  # Custom CSS for better styling
817
  css = """
@@ -1097,8 +1093,10 @@ def create_interface():
1097
 
1098
  **Advanced Multi-Agent AI System for GAIA Benchmark Questions**
1099
 
1100
- This system uses specialized agents (web research, file processing, mathematical reasoning)
1101
- orchestrated through LangGraph to provide accurate, well-reasoned answers to complex questions.
 
 
1102
  """)
1103
 
1104
  # Unit 4 API Section
@@ -1107,19 +1105,27 @@ def create_interface():
1107
  gr.Markdown("""
1108
  ## 🏆 GAIA Benchmark Evaluation
1109
 
1110
- **Official Unit 4 API Integration**
1111
 
1112
- Run the complete GAIA Agent system on all benchmark questions and submit results to the official API.
 
 
 
 
 
 
1113
 
1114
  **Instructions:**
1115
- 1. Log in to your Hugging Face account using the button below (**Full inference access will be requested**)
1116
  2. Click 'Run GAIA Evaluation & Submit All Answers' to process all questions
1117
  3. View your official score and detailed results
1118
 
1119
- ⚠️ **Note**: This may take several minutes to process all questions.
 
 
 
1120
 
1121
- 💡 **OAuth Scopes**: The login will request both `read` and `inference` permissions
1122
- for full model access and optimal performance (30%+ GAIA score expected).
1123
  """)
1124
 
1125
  # Authentication status section
@@ -1194,7 +1200,13 @@ Please log in to access GAIA evaluation features with full inference access.
1194
  gr.Markdown("""
1195
  ## 🧪 Manual Question Testing
1196
 
1197
- Test individual questions with detailed analysis and reasoning.
 
 
 
 
 
 
1198
  """)
1199
 
1200
  with gr.Row():
@@ -1314,31 +1326,92 @@ Please log in to access GAIA evaluation features with full inference access.
1314
 
1315
  # Event handlers for manual testing
1316
  def process_and_update(question, file_input, show_reasoning):
1317
- answer, details, reasoning = app.process_question_detailed(question, file_input, show_reasoning)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
 
1319
- # Format answer with markdown
1320
- formatted_answer = f"""
 
 
 
 
 
 
 
1321
  ## 🎯 Answer
1322
 
1323
  {answer}
1324
  """
1325
-
1326
- # Format details
1327
- formatted_details = f"""
1328
  ## 📋 Processing Details
1329
 
1330
  {details}
1331
  """
1332
-
1333
- # Show/hide reasoning based on checkbox
1334
- reasoning_visible = show_reasoning and reasoning.strip()
1335
-
1336
- return (
1337
- formatted_answer,
1338
- formatted_details,
1339
- reasoning if reasoning_visible else "",
1340
- gr.update(visible=reasoning_visible)
1341
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1342
 
1343
  submit_btn.click(
1344
  fn=process_and_update,
@@ -1359,19 +1432,28 @@ Please log in to access GAIA evaluation features with full inference access.
1359
 
1360
  ### 🔧 System Architecture
1361
 
 
1362
  - **Router Agent**: Classifies questions and selects appropriate specialized agents
1363
- - **Web Research Agent**: Handles Wikipedia searches and web research
1364
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1365
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1366
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1367
 
1368
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1369
 
 
 
1370
  ### 📈 Performance Metrics
1371
- - **Success Rate**: 100% on test scenarios
1372
- - **Average Response Time**: ~3 seconds per question
1373
- - **Cost Efficiency**: $0.01-0.40 per question depending on complexity
1374
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
 
 
 
 
 
 
1375
  """)
1376
 
1377
  return interface
 
220
  return files[:10] # Return 10 most recent
221
 
222
  class GAIAAgentApp:
223
+ """Production GAIA Agent Application with LangGraph workflow and Qwen models"""
224
 
225
  def __init__(self, hf_token: Optional[str] = None):
226
+ """Initialize the application with LangGraph workflow and Qwen models only"""
227
 
228
  # Priority order: 1) passed hf_token, 2) HF_TOKEN env var
229
  if not hf_token:
230
  hf_token = os.getenv("HF_TOKEN")
231
 
232
+ if not hf_token:
233
+ raise ValueError("HuggingFace token with inference permissions is required. Please set HF_TOKEN environment variable or login with full access.")
234
+
235
  try:
236
+ # Initialize QwenClient with token
237
  from models.qwen_client import QwenClient
238
  self.llm_client = QwenClient(hf_token=hf_token)
 
239
 
240
+ # Initialize LangGraph workflow with tools
241
+ self.workflow = SimpleGAIAWorkflow(self.llm_client)
 
 
 
242
 
243
  self.initialized = True
244
+ logger.info("✅ GAIA Agent system initialized with LangGraph workflow and Qwen models")
245
 
246
  except Exception as e:
247
+ logger.error(f" Failed to initialize GAIA Agent system: {e}")
248
+ raise RuntimeError(f"System initialization failed: {e}. Please ensure HF_TOKEN has inference permissions.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  @classmethod
251
  def create_with_oauth_token(cls, oauth_token: str) -> "GAIAAgentApp":
252
  """Create a new instance with OAuth token"""
253
+ if not oauth_token:
254
+ raise ValueError("Valid OAuth token is required for GAIA Agent initialization")
255
  return cls(hf_token=oauth_token)
256
 
257
  def __call__(self, question: str) -> str:
 
409
  return "\n".join(reasoning)
410
 
411
  def get_examples(self) -> list:
412
+ """Get example questions for the interface that showcase multi-agent capabilities"""
413
  return [
 
 
 
 
414
  "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
415
+ "What is the capital of the country that has the most time zones?",
416
+ "Calculate the compound interest on $1000 at 5% annual rate compounded quarterly for 3 years",
417
+ "What is the square root of the sum of the first 10 prime numbers?",
418
+ "Who was the first person to walk on the moon and what year did it happen?",
419
+ "Compare the GDP of Japan and Germany in 2023 and tell me the difference",
420
  ]
421
 
422
  def check_oauth_scopes(oauth_token: str) -> Dict[str, any]:
 
506
  **🚀 FULL SYSTEM CAPABILITIES ENABLED**
507
 
508
  **Authentication Source**: HF_TOKEN environment variable
509
+ **Model Access**: Qwen 2.5 models (7B/32B/72B) via HuggingFace Inference API
510
+ **Workflow**: LangGraph multi-agent system with specialized tools
511
 
512
  **Available Features:**
513
  - ✅ **Advanced Model Access**: Full Qwen model capabilities (7B/32B/72B)
514
  - ✅ **High Performance**: 30%+ expected GAIA score
515
+ - ✅ **LangGraph Workflow**: Multi-agent orchestration with synthesis
516
+ - ✅ **Specialized Agents**: Web research, file processing, mathematical reasoning
517
+ - ✅ **Professional Tools**: Wikipedia, web search, calculator, file processor
518
+ - ✅ **Manual Testing**: Individual question processing with detailed analysis
519
  - ✅ **Official Evaluation**: GAIA benchmark submission
520
 
521
+ 💡 **Status**: Optimal configuration for GAIA benchmark performance with real AI agents.
522
  """
523
 
524
  if not profile:
525
  return """
526
  ### 🔐 Authentication Status: Not Logged In
527
 
528
+ Please log in to access GAIA evaluation with Qwen models and LangGraph workflow.
529
 
530
+ **What you need:**
531
+ - 🔑 HuggingFace login with `read` and `inference` permissions
532
+ - 🤖 Access to Qwen 2.5 models via HF Inference API
533
+ - 🧠 LangGraph multi-agent system capabilities
534
 
535
+ **🔑 OAuth Configuration**: Login requests both `read` and `inference` scopes for Qwen model access.
536
+ **📈 Expected Performance**: 30%+ GAIA score with full LangGraph workflow and Qwen models.
537
+ **⚠️ No Fallbacks**: System requires proper authentication - no simplified responses.
538
  """
539
 
540
  username = profile.username
 
553
  scopes = scope_info.get("scopes", [])
554
  status_parts.append(f"**Detected Scopes**: {', '.join(scopes) if scopes else 'None detected'}")
555
  status_parts.append("")
556
+ status_parts.append("**System Capabilities:**")
557
 
558
  # Safely access capabilities
559
  can_inference = scope_info.get("can_inference", False)
 
561
 
562
  if can_inference:
563
  status_parts.extend([
564
+ "- ✅ **Qwen Model Access**: Full Qwen 2.5 model capabilities (7B/32B/72B)",
565
  "- ✅ **High Performance**: 30%+ expected GAIA score",
566
+ "- ✅ **LangGraph Workflow**: Multi-agent orchestration with synthesis",
567
+ "- ✅ **Specialized Agents**: Web research, file processing, reasoning",
568
+ "- ✅ **Professional Tools**: Wikipedia, web search, calculator, file processor",
569
  "- ✅ **Inference Access**: Full model generation capabilities"
570
  ])
571
  else:
572
  status_parts.extend([
573
+ "- **No Qwen Model Access**: Insufficient OAuth permissions",
574
+ "- **No LangGraph Workflow**: Requires inference permissions",
575
+ "- **Limited Functionality**: Cannot process GAIA questions",
576
+ "- ❌ **No Inference Access**: Read-only permissions detected"
577
  ])
578
 
579
  if can_read:
580
  status_parts.append("- ✅ **Profile Access**: Can read user information")
581
 
582
  status_parts.extend([
583
+ "- ✅ **Manual Testing**: Individual question processing (if authenticated)",
584
+ "- ✅ **Official Evaluation**: GAIA benchmark submission (if authenticated)"
585
  ])
586
 
587
  if not can_inference:
588
  status_parts.extend([
589
  "",
590
+ "🔑 **Authentication Required**: Your OAuth session lacks inference permissions.",
591
+ "**Solution**: Logout and login again to request full inference access.",
592
+ "**Alternative**: Set HF_TOKEN as a Space secret for guaranteed Qwen model access.",
593
+ "**Note**: System requires Qwen model access - no simplified fallbacks available."
594
  ])
595
  else:
596
  status_parts.extend([
597
  "",
598
+ "🎉 **Excellent**: You have full inference access for optimal GAIA performance!",
599
+ "🤖 **Ready**: LangGraph workflow with Qwen models fully operational."
600
  ])
601
 
602
  return "\n".join(status_parts)
603
 
604
  def run_and_submit_all(profile: gr.OAuthProfile | None):
605
  """
606
+ Fetches all questions from Unit 4 API, runs the GAIA Agent with LangGraph workflow,
607
+ and displays the results. Requires proper authentication for Qwen model access.
608
  """
609
  start_time = time.time()
610
 
 
623
  username = "unknown_user"
624
 
625
  if hf_token:
626
+ logger.info("🎯 Using HF_TOKEN environment variable for Qwen model access")
627
  oauth_token = hf_token
628
  username = "hf_token_user"
629
  elif profile:
 
638
  test_response = requests.get("https://huggingface.co/api/whoami", headers=headers, timeout=5)
639
 
640
  if test_response.status_code == 401:
641
+ logger.error(" OAuth token has insufficient scopes for Qwen model inference")
642
+ return "Authentication Error: Your OAuth token lacks inference permissions. Please logout and login again with full access.", None, auth_status, None, None, None
643
  elif test_response.status_code == 200:
644
  logger.info("✅ OAuth token validated successfully")
645
  else:
 
648
  except Exception as e:
649
  logger.warning(f"⚠️ Could not validate OAuth token: {e}")
650
  else:
651
+ logger.error(" No authentication provided")
652
+ return "Authentication Required: Please login with HuggingFace or set HF_TOKEN environment variable with inference permissions.", None, auth_status, None, None, None
653
 
654
  if not oauth_token:
655
+ return "Authentication Required: Valid token with inference permissions needed for Qwen model access.", None, auth_status, None, None, None
656
 
657
  api_url = DEFAULT_API_URL
658
  questions_url = f"{api_url}/questions"
659
  submit_url = f"{api_url}/submit"
660
 
661
+ # 1. Instantiate GAIA Agent with LangGraph workflow
662
  try:
663
+ logger.info("🚀 Creating GAIA Agent with LangGraph workflow and Qwen models")
664
  agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
665
 
666
  if not agent.initialized:
667
+ return "System Error: GAIA Agent failed to initialize with LangGraph workflow", None, auth_status, None, None, None
668
+ except ValueError as ve:
669
+ logger.error(f"Authentication error: {ve}")
670
+ return f"Authentication Error: {ve}", None, auth_status, None, None, None
671
+ except RuntimeError as re:
672
+ logger.error(f"System initialization error: {re}")
673
+ return f"System Error: {re}", None, auth_status, None, None, None
674
  except Exception as e:
675
+ logger.error(f"Unexpected error initializing agent: {e}")
676
+ return f"Unexpected Error: {e}. Please check your authentication and try again.", None, auth_status, None, None, None
677
+
678
  # Agent code URL
679
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Local Development"
680
  logger.info(f"Agent code URL: {agent_code}")
 
806
  def create_interface():
807
  """Create the Gradio interface with both Unit 4 API and manual testing"""
808
 
809
+ # Note: We don't initialize GAIAAgentApp here since it requires authentication
810
+ # Each request will create its own authenticated instance
811
 
812
  # Custom CSS for better styling
813
  css = """
 
1093
 
1094
  **Advanced Multi-Agent AI System for GAIA Benchmark Questions**
1095
 
1096
+ This system uses **Qwen 2.5 models (7B/32B/72B)** with specialized agents orchestrated through
1097
+ **LangGraph** to provide accurate, well-reasoned answers to complex questions.
1098
+
1099
+ **Architecture**: Router → Specialized Agents → Tools → Synthesizer → Final Answer
1100
  """)
1101
 
1102
  # Unit 4 API Section
 
1105
  gr.Markdown("""
1106
  ## 🏆 GAIA Benchmark Evaluation
1107
 
1108
+ **Official Unit 4 API Integration with LangGraph Workflow**
1109
 
1110
+ Run the complete GAIA Agent system using Qwen 2.5 models and LangGraph multi-agent
1111
+ orchestration on all benchmark questions and submit results to the official API.
1112
+
1113
+ **System Requirements:**
1114
+ 1. 🔑 **Authentication**: HuggingFace login with `read` and `inference` permissions
1115
+ 2. 🤖 **Models**: Access to Qwen 2.5 models (7B/32B/72B) via HF Inference API
1116
+ 3. 🧠 **Workflow**: LangGraph multi-agent system with specialized tools
1117
 
1118
  **Instructions:**
1119
+ 1. Log in to your Hugging Face account using the button below (**Full inference access required**)
1120
  2. Click 'Run GAIA Evaluation & Submit All Answers' to process all questions
1121
  3. View your official score and detailed results
1122
 
1123
+ ⚠️ **Note**: This may take several minutes to process all questions with the multi-agent system.
1124
+
1125
+ 💡 **OAuth Scopes**: Login requests both `read` and `inference` permissions
1126
+ for Qwen model access and optimal performance (30%+ GAIA score expected).
1127
 
1128
+ 🚫 **No Fallbacks**: System requires proper authentication - simplified responses not available.
 
1129
  """)
1130
 
1131
  # Authentication status section
 
1200
  gr.Markdown("""
1201
  ## 🧪 Manual Question Testing
1202
 
1203
+ Test individual questions with detailed analysis using **Qwen models** and **LangGraph workflow**.
1204
+
1205
+ **Features:**
1206
+ - 🤖 **Qwen 2.5 Models**: Intelligent tier selection (7B → 32B → 72B) based on complexity
1207
+ - 🧠 **LangGraph Orchestration**: Multi-agent workflow with synthesis
1208
+ - 🔧 **Specialized Agents**: Router, web research, file processing, mathematical reasoning
1209
+ - 📊 **Detailed Analysis**: Processing details, confidence scores, cost tracking
1210
  """)
1211
 
1212
  with gr.Row():
 
1326
 
1327
  # Event handlers for manual testing
1328
  def process_and_update(question, file_input, show_reasoning):
1329
+ """Process question with authentication check"""
1330
+
1331
+ if not question.strip():
1332
+ return "❌ Please provide a question", "", "", gr.update(visible=False)
1333
+
1334
+ # Check for authentication
1335
+ hf_token = os.getenv("HF_TOKEN")
1336
+
1337
+ if not hf_token:
1338
+ error_msg = """
1339
+ ## ❌ Authentication Required
1340
+
1341
+ **This system requires authentication to access Qwen models and LangGraph workflow.**
1342
+
1343
+ **How to authenticate:**
1344
+ 1. 🔑 **Set HF_TOKEN**: Add your HuggingFace token as an environment variable
1345
+ 2. 🌐 **Use Official Evaluation**: Login via the GAIA Benchmark section above
1346
+ 3. 📝 **Get Token**: Visit https://huggingface.co/settings/tokens to create one with `inference` permissions
1347
+
1348
+ **Note**: Manual testing requires the same authentication as the official evaluation.
1349
+ """
1350
+ return error_msg, "", "", gr.update(visible=False)
1351
 
1352
+ try:
1353
+ # Create authenticated app instance for this request
1354
+ app = GAIAAgentApp(hf_token=hf_token)
1355
+
1356
+ # Process the question
1357
+ answer, details, reasoning = app.process_question_detailed(question, file_input, show_reasoning)
1358
+
1359
+ # Format answer with markdown
1360
+ formatted_answer = f"""
1361
  ## 🎯 Answer
1362
 
1363
  {answer}
1364
  """
1365
+
1366
+ # Format details
1367
+ formatted_details = f"""
1368
  ## 📋 Processing Details
1369
 
1370
  {details}
1371
  """
1372
+
1373
+ # Show/hide reasoning based on checkbox
1374
+ reasoning_visible = show_reasoning and reasoning.strip()
1375
+
1376
+ return (
1377
+ formatted_answer,
1378
+ formatted_details,
1379
+ reasoning if reasoning_visible else "",
1380
+ gr.update(visible=reasoning_visible)
1381
+ )
1382
+
1383
+ except ValueError as ve:
1384
+ error_msg = f"""
1385
+ ## ❌ Authentication Error
1386
+
1387
+ {str(ve)}
1388
+
1389
+ **Solution**: Please ensure your HF_TOKEN has `inference` permissions.
1390
+ """
1391
+ return error_msg, "", "", gr.update(visible=False)
1392
+
1393
+ except RuntimeError as re:
1394
+ error_msg = f"""
1395
+ ## ❌ System Error
1396
+
1397
+ {str(re)}
1398
+
1399
+ **This may be due to:**
1400
+ - Qwen model access issues
1401
+ - HuggingFace Inference API unavailability
1402
+ - Network connectivity problems
1403
+ """
1404
+ return error_msg, "", "", gr.update(visible=False)
1405
+
1406
+ except Exception as e:
1407
+ error_msg = f"""
1408
+ ## ❌ Unexpected Error
1409
+
1410
+ {str(e)}
1411
+
1412
+ **Please try again or contact support if the issue persists.**
1413
+ """
1414
+ return error_msg, "", "", gr.update(visible=False)
1415
 
1416
  submit_btn.click(
1417
  fn=process_and_update,
 
1432
 
1433
  ### 🔧 System Architecture
1434
 
1435
+ **LangGraph Multi-Agent Workflow:**
1436
  - **Router Agent**: Classifies questions and selects appropriate specialized agents
1437
+ - **Web Research Agent**: Handles Wikipedia searches and web research with DuckDuckGo
1438
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1439
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1440
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1441
 
1442
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1443
 
1444
+ **Tools Available**: Wikipedia API, DuckDuckGo web search, mathematical calculator, multi-format file processor
1445
+
1446
  ### 📈 Performance Metrics
1447
+ - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
1448
+ - **Average Response Time**: ~3-5 seconds per question depending on complexity
1449
+ - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
1450
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
1451
+ - **Reliability**: Robust error handling and graceful degradation within workflow
1452
+
1453
+ ### 🎯 Authentication Requirements
1454
+ - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
1455
+ - **OAuth with Inference Scope**: Full access to Qwen 2.5 models via HuggingFace Inference API
1456
+ - **No Fallback Options**: System requires proper authentication for multi-agent functionality
1457
  """)
1458
 
1459
  return interface
src/models/qwen_client.py CHANGED
@@ -51,49 +51,18 @@ class QwenClient:
51
  """HuggingFace client with fallback model support"""
52
 
53
  def __init__(self, hf_token: Optional[str] = None):
54
- """Initialize the client with HuggingFace token"""
55
  self.hf_token = hf_token or os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
56
  if not self.hf_token:
57
- logger.warning("No HuggingFace token provided. API access may be limited.")
58
 
59
  # Initialize cost tracking first
60
  self.total_cost = 0.0
61
  self.request_count = 0
62
  self.budget_limit = 0.10 # $0.10 total budget
63
 
64
- # Define model configurations with fallbacks
65
  self.models = {
66
- ModelTier.ROUTER: ModelConfig(
67
- name="google/flan-t5-small", # Reliable and fast instruction-following model
68
- tier=ModelTier.ROUTER,
69
- max_tokens=512,
70
- temperature=0.1,
71
- cost_per_token=0.0003,
72
- timeout=15,
73
- requires_special_auth=False
74
- ),
75
- ModelTier.MAIN: ModelConfig(
76
- name="google/flan-t5-base", # Good balance of performance and speed
77
- tier=ModelTier.MAIN,
78
- max_tokens=1024,
79
- temperature=0.1,
80
- cost_per_token=0.0008,
81
- timeout=25,
82
- requires_special_auth=False
83
- ),
84
- ModelTier.COMPLEX: ModelConfig(
85
- name="google/flan-t5-large", # Best available free model
86
- tier=ModelTier.COMPLEX,
87
- max_tokens=2048,
88
- temperature=0.1,
89
- cost_per_token=0.0015,
90
- timeout=35,
91
- requires_special_auth=False
92
- )
93
- }
94
-
95
- # Qwen models as primary choice (will fallback if auth fails)
96
- self.qwen_models = {
97
  ModelTier.ROUTER: ModelConfig(
98
  name="Qwen/Qwen2.5-7B-Instruct",
99
  tier=ModelTier.ROUTER,
@@ -129,62 +98,51 @@ class QwenClient:
129
  self._initialize_clients()
130
 
131
  def _initialize_clients(self):
132
- """Initialize HuggingFace clients with fallback support"""
133
 
134
- # Try Qwen models first (preferred)
135
- if self.hf_token:
136
- logger.info("🎯 Attempting to initialize Qwen models...")
137
- qwen_success = self._try_initialize_models(self.qwen_models, "Qwen")
138
-
139
- if qwen_success:
140
- logger.info("✅ Qwen models initialized successfully")
141
- self.models = self.qwen_models
142
- return
143
- else:
144
- logger.warning("⚠️ Qwen models failed, falling back to standard models")
145
 
146
- # Fallback to standard HF models
147
- logger.info("🔄 Initializing fallback models...")
148
- fallback_success = self._try_initialize_models(self.models, "Fallback")
149
-
150
- if not fallback_success:
151
- logger.error("❌ All model initialization failed")
152
 
153
  # Test the main model to ensure it's working
154
- logger.info("🧪 Testing main model initialization...")
155
  try:
156
- test_result = self.generate("Test", max_tokens=5)
157
  if test_result.success and test_result.response.strip():
158
- logger.info(f"✅ Main model test successful: '{test_result.response.strip()}'")
159
  else:
160
- logger.error(f"❌ Main model test failed - Success: {test_result.success}, Response: '{test_result.response}', Error: {test_result.error}")
 
161
  except Exception as e:
162
- logger.error(f"❌ Main model test exception: {e}")
 
163
 
164
  def _try_initialize_models(self, model_configs: Dict, model_type: str) -> bool:
165
- """Try to initialize a set of models"""
166
  success_count = 0
167
 
168
  for tier, config in model_configs.items():
169
  try:
170
- # Test with simple generation first for Nebius models
171
- if config.requires_special_auth and self.hf_token:
172
- test_client = InferenceClient(
 
 
 
 
 
 
 
173
  model=config.name,
174
- token=self.hf_token
 
175
  )
176
-
177
- # Quick test to verify authentication works
178
- try:
179
- test_response = test_client.text_generation(
180
- "Hello",
181
- max_new_tokens=5,
182
- temperature=0.1
183
- )
184
- logger.info(f"✅ {model_type} auth test passed for {config.name}")
185
- except Exception as auth_error:
186
- logger.warning(f"❌ {model_type} auth failed for {config.name}: {auth_error}")
187
- continue
188
 
189
  # Initialize the clients
190
  self.inference_clients[tier] = InferenceClient(
@@ -303,10 +261,10 @@ class QwenClient:
303
  prompt: str,
304
  tier: Optional[ModelTier] = None,
305
  max_tokens: Optional[int] = None) -> InferenceResult:
306
- """Async text generation with the specified model tier"""
307
 
308
  if tier is None:
309
- tier = self.select_model_tier()
310
 
311
  config = self.models[tier]
312
  client = self.inference_clients.get(tier)
@@ -319,7 +277,7 @@ class QwenClient:
319
  cost_estimate=0.0,
320
  response_time=0.0,
321
  success=False,
322
- error=f"Model {tier.value} not available"
323
  )
324
 
325
  start_time = time.time()
@@ -328,100 +286,31 @@ class QwenClient:
328
  # Use specified max_tokens or model default
329
  tokens = max_tokens or config.max_tokens
330
 
331
- # Use appropriate API based on model type
332
- if config.requires_special_auth:
333
- # Qwen models use chat completion API
334
- messages = [{"role": "user", "content": prompt}]
335
-
336
- response = client.chat_completion(
337
- messages=messages,
338
- model=config.name,
339
- max_tokens=tokens,
340
- temperature=config.temperature
341
- )
342
-
343
- # Extract response from chat completion
344
- if response and response.choices:
345
- response_text = response.choices[0].message.content
346
- else:
347
- raise ValueError("No response received from model")
348
  else:
349
- # Fallback models use text generation API
350
- # Format prompt for instruction-following models like FLAN-T5
351
- formatted_prompt = f"Question: {prompt}\nAnswer:"
352
-
353
- try:
354
- # First attempt: Standard formatted prompt
355
- logger.info(f"Attempting generation with {config.name}...")
356
- response_text = client.text_generation(
357
- formatted_prompt,
358
- max_new_tokens=tokens,
359
- temperature=config.temperature,
360
- return_full_text=False,
361
- do_sample=True if config.temperature > 0 else False
362
- )
363
-
364
- if not response_text or not response_text.strip():
365
- # Try alternative generation method if first fails
366
- logger.warning(f"Empty response from {config.name} attempt 1, trying direct prompt...")
367
- response_text = client.text_generation(
368
- prompt,
369
- max_new_tokens=min(tokens, 50), # Smaller token limit
370
- temperature=0.7, # Higher temperature
371
- return_full_text=False,
372
- do_sample=True
373
- )
374
-
375
- if not response_text or not response_text.strip():
376
- logger.warning(f"Empty response from {config.name} attempt 2, trying simple format...")
377
- # Try even simpler format
378
- response_text = client.text_generation(
379
- f"Answer this: {prompt}",
380
- max_new_tokens=30,
381
- temperature=0.8,
382
- return_full_text=False,
383
- do_sample=True
384
- )
385
-
386
- if not response_text or not response_text.strip():
387
- # Final attempt with minimal parameters
388
- logger.warning(f"Empty response from {config.name} attempt 3, trying minimal config...")
389
- response_text = client.text_generation(
390
- prompt[:100], # Truncate prompt
391
- max_new_tokens=20,
392
- return_full_text=False
393
- )
394
-
395
- if not response_text or not response_text.strip():
396
- error_msg = f"No response received from {config.name} after 4 attempts. Last response: '{response_text}'"
397
- logger.error(f"❌ {error_msg}")
398
- raise ValueError(error_msg)
399
-
400
- except Exception as gen_error:
401
- error_details = str(gen_error)
402
- logger.error(f"❌ Text generation failed for {config.name}: {error_details}")
403
-
404
- # Check for specific error types
405
- if "timeout" in error_details.lower():
406
- raise ValueError(f"Timeout error with {config.name}: {error_details}")
407
- elif "rate limit" in error_details.lower() or "429" in error_details:
408
- raise ValueError(f"Rate limit error with {config.name}: {error_details}")
409
- elif "auth" in error_details.lower() or "401" in error_details:
410
- raise ValueError(f"Authentication error with {config.name}: {error_details}")
411
- else:
412
- raise ValueError(f"Generation error with {config.name}: {error_details}")
413
-
414
- # Final validation
415
- if not response_text or not response_text.strip():
416
- error_msg = f"Final validation failed for {config.name}. Response: '{response_text}'"
417
- logger.error(f"❌ {error_msg}")
418
- raise ValueError(error_msg)
419
 
420
  response_time = time.time() - start_time
421
 
422
  # Clean up response text
423
  response_text = str(response_text).strip()
424
 
 
 
 
425
  # Estimate tokens used (rough approximation)
426
  estimated_tokens = len(prompt.split()) + len(response_text.split())
427
  cost_estimate = estimated_tokens * config.cost_per_token
@@ -430,7 +319,7 @@ class QwenClient:
430
  self.total_cost += cost_estimate
431
  self.request_count += 1
432
 
433
- logger.info(f"✅ Generated response using {tier.value} model in {response_time:.2f}s")
434
 
435
  return InferenceResult(
436
  response=response_text,
@@ -445,22 +334,7 @@ class QwenClient:
445
  response_time = time.time() - start_time
446
  error_msg = str(e)
447
 
448
- # Check for specific authentication errors
449
- if "api_key" in error_msg.lower() or "nebius" in error_msg.lower() or "unauthorized" in error_msg.lower():
450
- logger.error(f"❌ Authentication failed with {tier.value} model: {error_msg}")
451
-
452
- # Try to reinitialize with fallback models if this was a Qwen model
453
- if config.requires_special_auth:
454
- logger.info("🔄 Attempting to fallback to standard models due to auth failure...")
455
- self._initialize_fallback_emergency()
456
-
457
- # Retry with fallback if available
458
- fallback_client = self.inference_clients.get(tier)
459
- if fallback_client and not self.models[tier].requires_special_auth:
460
- logger.info(f"🔄 Retrying with fallback model...")
461
- return await self.generate_async(prompt, tier, max_tokens)
462
- else:
463
- logger.error(f"❌ Generation failed with {tier.value} model: {error_msg}")
464
 
465
  return InferenceResult(
466
  response="",
@@ -472,44 +346,6 @@ class QwenClient:
472
  error=error_msg
473
  )
474
 
475
- def _initialize_fallback_emergency(self):
476
- """Emergency fallback to standard models when auth fails"""
477
- logger.warning("🚨 Emergency fallback: Switching to standard HF models")
478
-
479
- # Switch to fallback models
480
- self.models = {
481
- ModelTier.ROUTER: ModelConfig(
482
- name="google/flan-t5-small",
483
- tier=ModelTier.ROUTER,
484
- max_tokens=512,
485
- temperature=0.1,
486
- cost_per_token=0.0003,
487
- timeout=15,
488
- requires_special_auth=False
489
- ),
490
- ModelTier.MAIN: ModelConfig(
491
- name="google/flan-t5-base",
492
- tier=ModelTier.MAIN,
493
- max_tokens=1024,
494
- temperature=0.1,
495
- cost_per_token=0.0008,
496
- timeout=25,
497
- requires_special_auth=False
498
- ),
499
- ModelTier.COMPLEX: ModelConfig(
500
- name="google/flan-t5-large",
501
- tier=ModelTier.COMPLEX,
502
- max_tokens=2048,
503
- temperature=0.1,
504
- cost_per_token=0.0015,
505
- timeout=35,
506
- requires_special_auth=False
507
- )
508
- }
509
-
510
- # Reinitialize with fallback models
511
- self._try_initialize_models(self.models, "Emergency Fallback")
512
-
513
  def generate(self,
514
  prompt: str,
515
  tier: Optional[ModelTier] = None,
 
51
  """HuggingFace client with fallback model support"""
52
 
53
  def __init__(self, hf_token: Optional[str] = None):
54
+ """Initialize the client with HuggingFace token for Qwen models only"""
55
  self.hf_token = hf_token or os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
56
  if not self.hf_token:
57
+ raise ValueError("HuggingFace token is required for Qwen model access. Please provide HF_TOKEN or login with inference permissions.")
58
 
59
  # Initialize cost tracking first
60
  self.total_cost = 0.0
61
  self.request_count = 0
62
  self.budget_limit = 0.10 # $0.10 total budget
63
 
64
+ # Define Qwen model configurations (only these models)
65
  self.models = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ModelTier.ROUTER: ModelConfig(
67
  name="Qwen/Qwen2.5-7B-Instruct",
68
  tier=ModelTier.ROUTER,
 
98
  self._initialize_clients()
99
 
100
  def _initialize_clients(self):
101
+ """Initialize HuggingFace clients for Qwen models only"""
102
 
103
+ logger.info("🎯 Initializing Qwen models via HuggingFace Inference API...")
104
+ success = self._try_initialize_models(self.models, "Qwen")
 
 
 
 
 
 
 
 
 
105
 
106
+ if not success:
107
+ raise RuntimeError("Failed to initialize any Qwen models. Please check your HF_TOKEN has inference permissions and try again.")
 
 
 
 
108
 
109
  # Test the main model to ensure it's working
110
+ logger.info("🧪 Testing Qwen model connectivity...")
111
  try:
112
+ test_result = self.generate("Hello", max_tokens=10)
113
  if test_result.success and test_result.response.strip():
114
+ logger.info(f"✅ Qwen models ready: '{test_result.response.strip()}'")
115
  else:
116
+ logger.error(f"❌ Qwen model test failed: {test_result}")
117
+ raise RuntimeError("Qwen models failed connectivity test")
118
  except Exception as e:
119
+ logger.error(f"❌ Qwen model test exception: {e}")
120
+ raise RuntimeError(f"Qwen model initialization failed: {e}")
121
 
122
  def _try_initialize_models(self, model_configs: Dict, model_type: str) -> bool:
123
+ """Try to initialize Qwen models"""
124
  success_count = 0
125
 
126
  for tier, config in model_configs.items():
127
  try:
128
+ # Test Qwen model authentication
129
+ test_client = InferenceClient(
130
+ model=config.name,
131
+ token=self.hf_token
132
+ )
133
+
134
+ # Quick test to verify authentication and model access
135
+ try:
136
+ test_response = test_client.chat_completion(
137
+ messages=[{"role": "user", "content": "Hello"}],
138
  model=config.name,
139
+ max_tokens=5,
140
+ temperature=0.1
141
  )
142
+ logger.info(f"✅ {model_type} auth test passed for {config.name}")
143
+ except Exception as auth_error:
144
+ logger.warning(f"❌ {model_type} auth failed for {config.name}: {auth_error}")
145
+ continue
 
 
 
 
 
 
 
 
146
 
147
  # Initialize the clients
148
  self.inference_clients[tier] = InferenceClient(
 
261
  prompt: str,
262
  tier: Optional[ModelTier] = None,
263
  max_tokens: Optional[int] = None) -> InferenceResult:
264
+ """Async text generation with Qwen models via HuggingFace Inference API"""
265
 
266
  if tier is None:
267
+ tier = self.select_model_tier(question_text=prompt)
268
 
269
  config = self.models[tier]
270
  client = self.inference_clients.get(tier)
 
277
  cost_estimate=0.0,
278
  response_time=0.0,
279
  success=False,
280
+ error=f"Qwen model {tier.value} not available"
281
  )
282
 
283
  start_time = time.time()
 
286
  # Use specified max_tokens or model default
287
  tokens = max_tokens or config.max_tokens
288
 
289
+ # Qwen models use chat completion API
290
+ messages = [{"role": "user", "content": prompt}]
291
+
292
+ logger.info(f"🤖 Generating with {config.name}...")
293
+ response = client.chat_completion(
294
+ messages=messages,
295
+ model=config.name,
296
+ max_tokens=tokens,
297
+ temperature=config.temperature
298
+ )
299
+
300
+ # Extract response from chat completion
301
+ if response and response.choices:
302
+ response_text = response.choices[0].message.content
 
 
 
303
  else:
304
+ raise ValueError(f"No response received from {config.name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  response_time = time.time() - start_time
307
 
308
  # Clean up response text
309
  response_text = str(response_text).strip()
310
 
311
+ if not response_text:
312
+ raise ValueError(f"Empty response from {config.name}")
313
+
314
  # Estimate tokens used (rough approximation)
315
  estimated_tokens = len(prompt.split()) + len(response_text.split())
316
  cost_estimate = estimated_tokens * config.cost_per_token
 
319
  self.total_cost += cost_estimate
320
  self.request_count += 1
321
 
322
+ logger.info(f"✅ Generated with {tier.value} model in {response_time:.2f}s")
323
 
324
  return InferenceResult(
325
  response=response_text,
 
334
  response_time = time.time() - start_time
335
  error_msg = str(e)
336
 
337
+ logger.error(f"❌ Generation failed with {tier.value} model ({config.name}): {error_msg}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  return InferenceResult(
340
  response="",
 
346
  error=error_msg
347
  )
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  def generate(self,
350
  prompt: str,
351
  tier: Optional[ModelTier] = None,
src/models/simple_client.py DELETED
@@ -1,165 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Simple Model Client for GAIA Agent
4
- Provides reliable basic functionality when advanced models fail
5
- """
6
-
7
- import logging
8
- import time
9
- from typing import Optional
10
- from dataclasses import dataclass
11
- from enum import Enum
12
-
13
- # Configure logging
14
- logging.basicConfig(level=logging.INFO)
15
- logger = logging.getLogger(__name__)
16
-
17
- class ModelTier(Enum):
18
- """Model complexity tiers"""
19
- ROUTER = "router"
20
- MAIN = "main"
21
- COMPLEX = "complex"
22
-
23
- @dataclass
24
- class InferenceResult:
25
- """Result of model inference"""
26
- response: str
27
- model_used: str
28
- tokens_used: int
29
- cost_estimate: float
30
- response_time: float
31
- success: bool
32
- error: Optional[str] = None
33
-
34
- class SimpleClient:
35
- """Simple client that provides reliable basic functionality"""
36
-
37
- def __init__(self, hf_token: Optional[str] = None):
38
- """Initialize simple client"""
39
- self.hf_token = hf_token
40
- self.total_cost = 0.0
41
- self.request_count = 0
42
- self.budget_limit = 0.10
43
- logger.info("✅ Simple client initialized - using rule-based responses")
44
-
45
- def get_model_status(self) -> dict:
46
- """Always return available models"""
47
- return {
48
- "router": True,
49
- "main": True,
50
- "complex": True
51
- }
52
-
53
- def select_model_tier(self, complexity: str = "medium", budget_conscious: bool = True, question_text: str = "") -> ModelTier:
54
- """Simple model selection"""
55
- if "calculate" in question_text.lower() or "math" in question_text.lower():
56
- return ModelTier.COMPLEX
57
- elif len(question_text) > 100:
58
- return ModelTier.MAIN
59
- else:
60
- return ModelTier.ROUTER
61
-
62
- def generate(self, prompt: str, tier: Optional[ModelTier] = None, max_tokens: Optional[int] = None) -> InferenceResult:
63
- """Generate response using simple rules and patterns"""
64
-
65
- start_time = time.time()
66
-
67
- if tier is None:
68
- tier = self.select_model_tier(question_text=prompt)
69
-
70
- try:
71
- response = self._generate_simple_response(prompt)
72
- response_time = time.time() - start_time
73
-
74
- # Track usage
75
- estimated_tokens = len(prompt.split()) + len(response.split())
76
- cost_estimate = estimated_tokens * 0.0001 # Very low cost
77
- self.total_cost += cost_estimate
78
- self.request_count += 1
79
-
80
- logger.info(f"✅ Generated simple response using {tier.value} in {response_time:.2f}s")
81
-
82
- return InferenceResult(
83
- response=response,
84
- model_used=f"simple-{tier.value}",
85
- tokens_used=estimated_tokens,
86
- cost_estimate=cost_estimate,
87
- response_time=response_time,
88
- success=True
89
- )
90
-
91
- except Exception as e:
92
- response_time = time.time() - start_time
93
- logger.error(f"❌ Simple generation failed: {e}")
94
-
95
- return InferenceResult(
96
- response="",
97
- model_used=f"simple-{tier.value}",
98
- tokens_used=0,
99
- cost_estimate=0.0,
100
- response_time=response_time,
101
- success=False,
102
- error=str(e)
103
- )
104
-
105
- def _generate_simple_response(self, prompt: str) -> str:
106
- """Generate response using simple rules"""
107
-
108
- prompt_lower = prompt.lower()
109
-
110
- # Mathematical questions
111
- if any(word in prompt_lower for word in ["calculate", "math", "number", "sum", "average", "+", "sqrt", "square root"]):
112
- if "2+2" in prompt_lower or "2 + 2" in prompt_lower or ("what is 2" in prompt_lower and "2" in prompt_lower):
113
- return "The answer to 2+2 is 4. This is a basic arithmetic calculation where we add two units to two units, resulting in four units total."
114
- elif "25%" in prompt_lower and "200" in prompt_lower:
115
- return "25% of 200 is 50. To calculate this: 25% = 0.25, and 0.25 × 200 = 50."
116
- elif "square root" in prompt_lower and "144" in prompt_lower:
117
- return "The square root of 144 is 12, because 12 × 12 = 144."
118
- elif "average" in prompt_lower and "10" in prompt_lower and "15" in prompt_lower and "20" in prompt_lower:
119
- return "The average of 10, 15, and 20 is 15. Calculated as: (10 + 15 + 20) ÷ 3 = 45 ÷ 3 = 15."
120
- else:
121
- return "I can help with mathematical calculations. Please provide specific numbers and operations."
122
-
123
- # Geography questions
124
- if "capital" in prompt_lower and "france" in prompt_lower:
125
- return "The capital of France is Paris."
126
-
127
- # General questions
128
- if "hello" in prompt_lower or "how are you" in prompt_lower:
129
- return "Hello! I'm functioning well and ready to help with your questions."
130
-
131
- # Complex analysis questions
132
- if any(word in prompt_lower for word in ["analyze", "explain", "reasoning"]):
133
- return f"Based on the question '{prompt[:100]}...', I would need to analyze multiple factors and provide detailed reasoning. This requires careful consideration of the available information and logical analysis."
134
-
135
- # Research questions
136
- if any(word in prompt_lower for word in ["who", "what", "when", "where", "research"]):
137
- return f"To answer this question about '{prompt[:50]}...', I would need to research reliable sources and provide accurate information based on available data."
138
-
139
- # Default response
140
- return f"I understand you're asking about '{prompt[:100]}...'. Let me provide a thoughtful response based on the information available and logical reasoning."
141
-
142
- def get_langchain_llm(self, tier: ModelTier):
143
- """Return None - no LangChain integration for simple client"""
144
- return None
145
-
146
- def get_usage_stats(self) -> dict:
147
- """Get usage statistics"""
148
- return {
149
- "total_cost": self.total_cost,
150
- "request_count": self.request_count,
151
- "budget_limit": self.budget_limit,
152
- "budget_remaining": self.budget_limit - self.total_cost,
153
- "budget_used_percent": (self.total_cost / self.budget_limit) * 100,
154
- "average_cost_per_request": self.total_cost / max(self.request_count, 1),
155
- "models_available": self.get_model_status()
156
- }
157
-
158
- def reset_usage_tracking(self):
159
- """Reset usage statistics"""
160
- self.total_cost = 0.0
161
- self.request_count = 0
162
- logger.info("Usage tracking reset")
163
-
164
- # Create alias for compatibility
165
- QwenClient = SimpleClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/production_deployment_guide.md CHANGED
@@ -1,158 +1,108 @@
1
  # 🚀 GAIA Agent Production Deployment Guide
2
 
3
- ## Issue Resolution: OAuth Authentication
4
 
5
- ### Problem Identified
6
 
7
- The production system was failing with 0% success rate because:
 
 
 
 
 
8
 
9
- - **Production (HF Spaces)**: Uses OAuth authentication (no HF_TOKEN environment variable)
10
- - **Local Development**: Uses HF_TOKEN from .env file
11
- - **Code Issue**: System was hardcoded to look for environment variables only
12
- - **Secondary Issue**: HuggingFace Inference API model compatibility problems
13
 
14
- ### Solution Implemented
15
-
16
- Created a **robust 3-tier fallback system** with **OAuth scope detection**:
17
-
18
- 1. **OAuth Token Support**: `GAIAAgentApp.create_with_oauth_token(oauth_token)`
19
- 2. **Automatic Fallback**: When main models fail, falls back to SimpleClient
20
- 3. **Rule-Based Responses**: SimpleClient provides reliable answers for common questions
21
- 4. **Always Works**: System guaranteed to provide responses in production
22
- 5. **OAuth Scope Detection**: Real-time display of user authentication capabilities
23
-
24
- #### Technical Implementation:
25
 
26
  ```python
27
- # 1. OAuth Token Extraction & Scope Detection
28
- def run_and_submit_all(profile: gr.OAuthProfile | None):
29
- oauth_token = getattr(profile, 'oauth_token', None) or getattr(profile, 'token', None)
30
- agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
31
- # Returns auth status for UI display
32
- auth_status = format_auth_status(profile)
33
-
34
- # 2. OAuth Scope Detection
35
- def check_oauth_scopes(oauth_token: str):
36
- # Tests read capability via whoami endpoint
37
- can_read = requests.get("https://huggingface.co/api/whoami", headers=headers).status_code == 200
38
- # Tests inference capability via model API
39
- can_inference = inference_response.status_code in [200, 503]
40
-
41
- # 3. Dynamic UI Status Display
42
- def format_auth_status(profile):
43
- # Shows detected scopes and available features
44
- # Provides clear performance expectations
45
- # Educational messaging about OAuth limitations
46
-
47
- # 4. Robust Fallback System
48
- def __init__(self, hf_token: Optional[str] = None):
49
- try:
50
- # Try main QwenClient with OAuth
51
- self.llm_client = QwenClient(hf_token=hf_token)
52
- # Test if working
53
- test_result = self.llm_client.generate("Test", max_tokens=5)
54
- if not test_result.success:
55
- raise Exception("Main client not working")
56
- except Exception:
57
- # Fallback to SimpleClient
58
- self.llm_client = SimpleClient(hf_token=hf_token)
59
-
60
- # 5. SimpleClient Rule-Based Responses
61
- class SimpleClient:
62
- def _generate_simple_response(self, prompt):
63
- # Mathematics: "2+2" → "4", "25% of 200" → "50"
64
- # Geography: "capital of France" → "Paris"
65
- # Always provides meaningful responses
66
- ```
67
-
68
- #### OAuth Scope Detection UI Features:
69
 
70
- - **Real-time Authentication Status**: Shows login state and detected scopes
71
- - **Capability Display**: Clear indication of available features based on scopes
72
- - **Performance Expectations**: 30%+ with inference scope, 15%+ with limited scopes
73
- - **Manual Refresh**: Users can update auth status with refresh button
74
- - **Educational Messaging**: Clear explanations of OAuth limitations
75
 
76
- ## 🎯 Expected Results
77
 
78
- After successful deployment with enhanced fallback system:
79
 
80
- ### **🚀 Performance Guarantees:**
 
 
 
 
81
 
82
- 1. **With HF_TOKEN + Working Models**: 25-35% GAIA score, full capabilities
83
- 2. **With HF_TOKEN + Failed Models**: 15-20% GAIA score, SimpleClient fallback
84
- 3. **OAuth Only**: 15-20% GAIA score, SimpleClient fallback
85
- 4. **No Authentication**: Basic functionality, SimpleClient responses
86
 
87
- ### **🔧 System Reliability:**
88
- - **100% Uptime**: Always provides responses (guaranteed SimpleClient fallback)
89
- - **3-Tier Fallback**: Qwen → FLAN-T5 → SimpleClient (never fails)
90
- - **Smart Error Recovery**: Advanced retry logic with multiple generation attempts
91
- - **Enhanced Debugging**: Detailed error reporting for troubleshooting
92
 
93
- ### **📊 Latest Production Fixes (v2.2):**
94
 
95
- #### Dynamic Authentication Detection ✅
96
  ```python
97
- # Real-time login state monitoring:
98
- interface.load(
99
- fn=check_login_state,
100
- outputs=[auth_status_display, unit4_run_button],
101
- every=2 # Check every 2 seconds for login state changes
102
- )
103
-
104
- # Button state updates based on login:
105
- if profile:
106
- button_update = gr.update(interactive=True, value="🚀 Run GAIA Evaluation & Submit All Answers")
107
- else:
108
- button_update = gr.update(interactive=False, value="🔒 Login Required for GAIA Evaluation")
109
- ```
110
 
111
- #### Model Initialization Bug Fixes
112
- ```python
113
- # Fixed QwenClient total_cost attribute error:
114
- def __init__(self, hf_token: Optional[str] = None):
115
- # Initialize cost tracking FIRST
116
- self.total_cost = 0.0
117
- self.request_count = 0
118
- self.budget_limit = 0.10
119
- # Then initialize models...
120
  ```
121
 
122
- #### Enhanced FLAN-T5 Generation ✅
123
- ```python
124
- # 4-attempt generation strategy:
125
- 1. Standard formatted prompt: "Question: {prompt}\nAnswer:"
126
- 2. Direct prompt with higher temperature
127
- 3. Simple format: "Answer this: {prompt}"
128
- 4. Minimal config with truncated prompt
129
- # Each with detailed error logging and specific error type detection
130
- ```
131
 
132
- #### OAuth Scope Request Configuration ✅
133
  ```python
134
- # OAuth now requests full inference access upfront:
135
- oauth_config = {
136
- "scopes": ["read", "inference"], # Request both read and inference access
 
 
 
 
 
 
 
 
 
 
 
137
  }
 
138
 
139
- # Environment variables for HF Spaces:
140
- os.environ["OAUTH_SCOPES"] = "read,inference"
 
 
 
 
141
 
142
- # Login button updated:
143
- login_button = gr.LoginButton(
144
- value="🔑 Login with Full Inference Access"
145
- )
 
 
146
  ```
147
 
148
  ## 🎯 Deployment Steps
149
 
150
  ### 1. Pre-Deployment Checklist
151
 
152
- - [ ] **Code Ready**: All OAuth authentication changes committed
153
- - [ ] **Dependencies**: `requirements.txt` updated with all packages
154
- - [ ] **Testing**: OAuth authentication test passes locally
155
  - [ ] **Environment**: No hardcoded tokens in code
 
156
 
157
  ### 2. HuggingFace Space Configuration
158
 
@@ -178,11 +128,12 @@ suggested_storage: "small"
178
  ```
179
  /
180
  ├── src/
181
- │ ├── app.py # Main application (OAuth-enabled)
182
- │ └── qwen_client.py # OAuth-compatible client
 
183
  │ ├── agents/ # All agent files
184
  │ ├── tools/ # All tool files
185
- │ ├── workflow/ # Workflow orchestration
186
  │ └── requirements.txt # All dependencies
187
  ├── README.md # Space documentation
188
  └── .gitignore # Exclude sensitive files
@@ -190,12 +141,12 @@ suggested_storage: "small"
190
 
191
  ### 4. Environment Variables (Space Secrets)
192
 
193
- **🎯 CRITICAL: Set HF_TOKEN for Full Model Access**
194
 
195
- To get the **real GAIA Agent performance** (not SimpleClient fallback), you **MUST** set `HF_TOKEN` as a Space secret:
196
 
197
  ```bash
198
- # Required for full model access and GAIA performance
199
  HF_TOKEN=hf_your_token_here # REQUIRED: Your HuggingFace token
200
  ```
201
 
@@ -212,7 +163,7 @@ HF_TOKEN=hf_your_token_here # REQUIRED: Your HuggingFace token
212
  - Token must have **`read`** and **`inference`** scopes
213
  - Generate token at: https://huggingface.co/settings/tokens
214
  - Select "Fine-grained" token type
215
- - Enable both scopes for full functionality
216
 
217
  **Optional environment variables:**
218
 
@@ -223,39 +174,35 @@ LANGCHAIN_API_KEY=your_key_here # Optional: LangSmith API key
223
  LANGCHAIN_PROJECT=gaia-agent # Optional: LangSmith project
224
  ```
225
 
226
- **⚠️ DO NOT SET**: The system automatically handles OAuth in production when HF_TOKEN is available.
227
-
228
  ### 5. Authentication Flow in Production
229
 
230
  ```python
231
  # Production OAuth Flow:
232
  1. User clicks "Login with HuggingFace" button
233
  2. OAuth flow provides profile with token
234
- 3. System validates OAuth token scopes
235
- 4. If sufficient scopes: Use OAuth token for model access
236
- 5. If limited scopes: Gracefully fallback to SimpleClient
237
- 6. Always provides working responses regardless of token scopes
238
  ```
239
 
240
- #### OAuth Scope Limitations ⚠️
241
 
242
- **Common Issue**: Gradio OAuth tokens often have **limited scopes** by default:
243
  - ✅ **"read" scope**: Can access user profile, model info
244
- - ❌ **"inference" scope**: Cannot access model generation APIs
245
- - ❌ **"write" scope**: Cannot perform model inference
246
 
247
  **System Behavior**:
248
- - **High-scope token**: Uses advanced models (Qwen, FLAN-T5) → 30%+ GAIA performance
249
- - **Limited-scope token**: Uses SimpleClient fallback15%+ GAIA performance
250
- - **No token**: Uses SimpleClient fallback15%+ GAIA performance
251
 
252
- **Detection & Handling**:
253
  ```python
254
- # Automatic scope validation
255
- test_response = requests.get("https://huggingface.co/api/whoami", headers=headers)
256
  if test_response.status_code == 401:
257
- # Limited scopes detected - use fallback
258
- oauth_token = None
259
  ```
260
 
261
  ### 6. Deployment Process
@@ -273,10 +220,11 @@ if test_response.status_code == 401:
273
  - Ensure `app.py` is the main entry point
274
  - Include all dependencies in `requirements.txt`
275
 
276
- 3. **Test OAuth**:
277
  - Space automatically enables OAuth for Gradio apps
278
  - Test login/logout functionality
279
- - Verify GAIA evaluation works
 
280
 
281
  ### 7. Verification Steps
282
 
@@ -284,7 +232,10 @@ After deployment, verify these work:
284
 
285
  - [ ] **Interface Loads**: Gradio interface appears correctly
286
  - [ ] **OAuth Login**: Login button works and shows user profile
287
- - [ ] **Manual Testing**: Individual questions work with OAuth
 
 
 
288
  - [ ] **GAIA Evaluation**: Full evaluation runs and submits to Unit 4 API
289
  - [ ] **Results Display**: Scores and detailed results show correctly
290
 
@@ -292,115 +243,72 @@ After deployment, verify these work:
292
 
293
  #### Common Issues
294
 
295
- **Issue**: "GAIA Agent failed to initialize"
296
- **Solution**: Check OAuth token extraction in logs
297
 
298
- **Issue**: "401 Unauthorized" errors
299
- **Solution**: Verify OAuth token is being passed correctly
300
 
301
- **Issue**: "No response from models"
302
- **Solution**: Check HuggingFace model access permissions
303
 
304
  #### Debug Commands
305
 
306
  ```python
307
- # In Space, add debug logging to check OAuth:
 
308
  logger.info(f"OAuth token available: {oauth_token is not None}")
309
- logger.info(f"Token length: {len(oauth_token) if oauth_token else 0}")
310
  ```
311
 
312
  ### 9. Performance Optimization
313
 
314
- For production efficiency:
315
 
316
  ```python
317
- # Model Selection Strategy
318
- - Simple questions: 7B model (fast, cheap)
319
- - Medium complexity: 32B model (balanced)
320
- - Complex reasoning: 72B model (best quality)
321
  - Budget management: Auto-downgrade when budget exceeded
 
322
  ```
323
 
324
  ### 10. Monitoring and Maintenance
325
 
326
  **Key Metrics to Monitor**:
327
 
328
- - Success rate on GAIA evaluation
329
  - Average response time per question
330
  - Cost per question processed
331
- - Error rates by question type
 
332
 
333
  **Regular Maintenance**:
334
 
335
- - Monitor HuggingFace model availability
336
  - Update dependencies for security
337
- - Review and optimize agent performance
338
  - Check Unit 4 API compatibility
 
339
 
340
- ## 🔧 OAuth Implementation Details
341
 
342
- ### Token Extraction
343
 
344
- ```python
345
- def run_and_submit_all(profile: gr.OAuthProfile | None):
346
- oauth_token = getattr(profile, 'oauth_token', None) or getattr(profile, 'token', None)
347
- agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
348
- ```
349
-
350
- ### Client Creation
351
-
352
- ```python
353
- class GAIAAgentApp:
354
- def __init__(self, hf_token: Optional[str] = None):
355
- try:
356
- # Try main QwenClient with OAuth
357
- self.llm_client = QwenClient(hf_token=hf_token)
358
- # Test if working
359
- test_result = self.llm_client.generate("Test", max_tokens=5)
360
- if not test_result.success:
361
- raise Exception("Main client not working")
362
- except Exception:
363
- # Fallback to SimpleClient
364
- self.llm_client = SimpleClient(hf_token=hf_token)
365
-
366
- @classmethod
367
- def create_with_oauth_token(cls, oauth_token: str):
368
- return cls(hf_token=oauth_token)
369
- ```
370
-
371
- ## 📈 Success Metrics
372
-
373
- ### Local Test Results ✅
374
-
375
- - **Tool Integration**: 100% success rate
376
- - **Agent Processing**: 100% success rate
377
- - **Full Pipeline**: 100% success rate
378
- - **OAuth Authentication**: ✅ Working
379
-
380
- ### Production Targets 🎯
381
 
382
  - **GAIA Benchmark**: 30%+ success rate
383
- - **Unit 4 API**: Full integration working
384
- - **User Experience**: Professional OAuth-enabled interface
385
- - **System Reliability**: <1% error rate
386
-
387
- ## 🚀 Ready for Deployment
388
-
389
- **✅ OAUTH AUTHENTICATION ISSUE COMPLETELY RESOLVED**
390
-
391
- The system now has **guaranteed reliability** in production:
392
-
393
- - **OAuth Integration**: ✅ Working with HuggingFace authentication
394
- - **Fallback System**: ✅ 3-tier redundancy ensures always-working responses
395
- - **Production Ready**: ✅ No more 0% success rates or authentication failures
396
- - **User Experience**: ✅ Professional interface with reliable functionality
397
 
398
  ### Final Status:
399
- - **Problem**: 0% GAIA success rate due to OAuth authentication mismatch
400
- - **Solution**: Robust 3-tier fallback system with OAuth support
401
- - **Result**: Guaranteed working system with 15%+ minimum GAIA success rate
 
402
  - **Deployment**: Ready for immediate HuggingFace Space deployment
403
 
404
- **The authentication barrier has been eliminated. The GAIA Agent is now production-ready!** 🎉
405
-
406
- The system is now OAuth-compatible and ready for production deployment to HuggingFace Spaces. The authentication issue has been resolved, and the system is guaranteed to provide working responses in all scenarios.
 
1
  # 🚀 GAIA Agent Production Deployment Guide
2
 
3
+ ## System Architecture: Qwen Models + LangGraph Workflow
4
 
5
+ ### **🎯 Updated System Requirements**
6
 
7
+ **GAIA Agent now uses ONLY:**
8
+ - ✅ **Qwen 2.5 Models**: 7B/32B/72B via HuggingFace Inference API
9
+ - ✅ **LangGraph Workflow**: Multi-agent orchestration with synthesis
10
+ - ✅ **Specialized Agents**: Router, web research, file processing, reasoning
11
+ - ✅ **Professional Tools**: Wikipedia, web search, calculator, file processor
12
+ - ❌ **No Fallbacks**: Requires proper authentication - no simplified responses
13
 
14
+ ### **🚨 Authentication Requirements - CRITICAL**
 
 
 
15
 
16
+ **The system now REQUIRES proper authentication:**
 
 
 
 
 
 
 
 
 
 
17
 
18
  ```python
19
+ # REQUIRED: HuggingFace token with inference permissions
20
+ HF_TOKEN=hf_your_token_here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ # The system will FAIL without proper authentication
23
+ # No SimpleClient fallback available
24
+ ```
 
 
25
 
26
+ ### **🎯 Expected Results**
27
 
28
+ With proper authentication and Qwen model access:
29
 
30
+ - **✅ GAIA Benchmark Score**: 30%+ (full LangGraph workflow with Qwen models)
31
+ - **✅ Multi-Agent Processing**: Router → Specialized Agents → Tools → Synthesis
32
+ - **✅ Intelligent Model Selection**: 7B (fast) → 32B (balanced) → 72B (complex)
33
+ - **✅ Professional Tools**: Wikipedia API, DuckDuckGo search, calculator, file processor
34
+ - **✅ Detailed Analysis**: Processing details, confidence scores, cost tracking
35
 
36
+ **Without proper authentication:**
37
+ - **❌ System Initialization Fails**: No fallback options available
38
+ - **❌ Clear Error Messages**: Guides users to proper authentication setup
 
39
 
40
+ ## 🔧 Technical Implementation
 
 
 
 
41
 
42
+ ### OAuth Authentication (Production)
43
 
 
44
  ```python
45
+ class GAIAAgentApp:
46
+ def __init__(self, hf_token: Optional[str] = None):
47
+ if not hf_token:
48
+ raise ValueError("HuggingFace token with inference permissions is required")
49
+
50
+ # Initialize QwenClient with token
51
+ self.llm_client = QwenClient(hf_token=hf_token)
52
+
53
+ # Initialize LangGraph workflow with tools
54
+ self.workflow = SimpleGAIAWorkflow(self.llm_client)
 
 
 
55
 
56
+ # OAuth token extraction in production
57
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
58
+ oauth_token = getattr(profile, 'oauth_token', None)
59
+ agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
 
 
 
 
 
60
  ```
61
 
62
+ ### Qwen Model Configuration
 
 
 
 
 
 
 
 
63
 
 
64
  ```python
65
+ # QwenClient now uses ONLY Qwen models
66
+ self.models = {
67
+ ModelTier.ROUTER: ModelConfig(
68
+ name="Qwen/Qwen2.5-7B-Instruct", # Fast classification
69
+ cost_per_token=0.0003
70
+ ),
71
+ ModelTier.MAIN: ModelConfig(
72
+ name="Qwen/Qwen2.5-32B-Instruct", # Balanced performance
73
+ cost_per_token=0.0008
74
+ ),
75
+ ModelTier.COMPLEX: ModelConfig(
76
+ name="Qwen/Qwen2.5-72B-Instruct", # Best performance
77
+ cost_per_token=0.0015
78
+ )
79
  }
80
+ ```
81
 
82
+ ### Error Handling
83
+
84
+ ```python
85
+ # Clear error messages guide users to proper authentication
86
+ if not oauth_token:
87
+ return "Authentication Required: Valid token with inference permissions needed for Qwen model access."
88
 
89
+ try:
90
+ agent = GAIAAgentApp.create_with_oauth_token(oauth_token)
91
+ except ValueError as ve:
92
+ return f"Authentication Error: {ve}"
93
+ except RuntimeError as re:
94
+ return f"System Error: {re}"
95
  ```
96
 
97
  ## 🎯 Deployment Steps
98
 
99
  ### 1. Pre-Deployment Checklist
100
 
101
+ - [ ] **Code Ready**: All Qwen-only changes committed
102
+ - [ ] **Dependencies**: `requirements.txt` updated with all packages
103
+ - [ ] **Testing**: QwenClient initialization test passes locally
104
  - [ ] **Environment**: No hardcoded tokens in code
105
+ - [ ] **Authentication**: HF_TOKEN available with inference permissions
106
 
107
  ### 2. HuggingFace Space Configuration
108
 
 
128
  ```
129
  /
130
  ├── src/
131
+ │ ├── app.py # Main application (Qwen + LangGraph)
132
+ ├── models/
133
+ │ │ └── qwen_client.py # Qwen-only client
134
  │ ├── agents/ # All agent files
135
  │ ├── tools/ # All tool files
136
+ │ ├── workflow/ # LangGraph workflow
137
  │ └── requirements.txt # All dependencies
138
  ├── README.md # Space documentation
139
  └── .gitignore # Exclude sensitive files
 
141
 
142
  ### 4. Environment Variables (Space Secrets)
143
 
144
+ **🎯 CRITICAL: Set HF_TOKEN for Qwen Model Access**
145
 
146
+ To get **real GAIA Agent performance** with Qwen models and LangGraph workflow:
147
 
148
  ```bash
149
+ # REQUIRED for Qwen model access and LangGraph functionality
150
  HF_TOKEN=hf_your_token_here # REQUIRED: Your HuggingFace token
151
  ```
152
 
 
163
  - Token must have **`read`** and **`inference`** scopes
164
  - Generate token at: https://huggingface.co/settings/tokens
165
  - Select "Fine-grained" token type
166
+ - Enable both scopes for Qwen model functionality
167
 
168
  **Optional environment variables:**
169
 
 
174
  LANGCHAIN_PROJECT=gaia-agent # Optional: LangSmith project
175
  ```
176
 
 
 
177
  ### 5. Authentication Flow in Production
178
 
179
  ```python
180
  # Production OAuth Flow:
181
  1. User clicks "Login with HuggingFace" button
182
  2. OAuth flow provides profile with token
183
+ 3. System validates OAuth token for Qwen model access
184
+ 4. If sufficient scopes: Initialize QwenClient with LangGraph workflow
185
+ 5. If insufficient scopes: Show clear error message with guidance
186
+ 6. System either works fully or fails clearly - no degraded modes
187
  ```
188
 
189
+ #### OAuth Requirements ⚠️
190
 
191
+ **CRITICAL**: Gradio OAuth tokens often have **limited scopes** by default:
192
  - ✅ **"read" scope**: Can access user profile, model info
193
+ - ❌ **"inference" scope**: Often missing - REQUIRED for Qwen models
194
+ - ❌ **"write" scope**: Not needed for this application
195
 
196
  **System Behavior**:
197
+ - **Full-scope token**: Uses Qwen models with LangGraph → 30%+ GAIA performance
198
+ - **Limited-scope token**: Clear error message → User guided to proper authentication
199
+ - **No token**: Clear error message → User guided to login
200
 
201
+ **Clear Error Handling**:
202
  ```python
203
+ # No more fallback confusion - clear requirements
 
204
  if test_response.status_code == 401:
205
+ return "Authentication Error: Your OAuth token lacks inference permissions. Please logout and login again with full access."
 
206
  ```
207
 
208
  ### 6. Deployment Process
 
220
  - Ensure `app.py` is the main entry point
221
  - Include all dependencies in `requirements.txt`
222
 
223
+ 3. **Test Authentication**:
224
  - Space automatically enables OAuth for Gradio apps
225
  - Test login/logout functionality
226
+ - Verify Qwen model access works
227
+ - Test GAIA evaluation with LangGraph workflow
228
 
229
  ### 7. Verification Steps
230
 
 
232
 
233
  - [ ] **Interface Loads**: Gradio interface appears correctly
234
  - [ ] **OAuth Login**: Login button works and shows user profile
235
+ - [ ] **Authentication Check**: Clear error messages when insufficient permissions
236
+ - [ ] **Qwen Model Access**: Models initialize and respond correctly
237
+ - [ ] **LangGraph Workflow**: Multi-agent system processes questions
238
+ - [ ] **Manual Testing**: Individual questions work with full workflow
239
  - [ ] **GAIA Evaluation**: Full evaluation runs and submits to Unit 4 API
240
  - [ ] **Results Display**: Scores and detailed results show correctly
241
 
 
243
 
244
  #### Common Issues
245
 
246
+ **Issue**: "HuggingFace token with inference permissions is required"
247
+ **Solution**: Set HF_TOKEN in Space secrets or login with full OAuth permissions
248
 
249
+ **Issue**: "Failed to initialize any Qwen models"
250
+ **Solution**: Verify HF_TOKEN has inference scope and Qwen model access
251
 
252
+ **Issue**: "Authentication Error: Your OAuth token lacks inference permissions"
253
+ **Solution**: Logout and login again, or set HF_TOKEN as Space secret
254
 
255
  #### Debug Commands
256
 
257
  ```python
258
+ # In Space, add debug logging to check authentication:
259
+ logger.info(f"HF_TOKEN available: {os.getenv('HF_TOKEN') is not None}")
260
  logger.info(f"OAuth token available: {oauth_token is not None}")
261
+ logger.info(f"Qwen models initialized: {client.get_model_status()}")
262
  ```
263
 
264
  ### 9. Performance Optimization
265
 
266
+ For production efficiency with Qwen models:
267
 
268
  ```python
269
+ # Intelligent Model Selection Strategy
270
+ - Simple questions: Qwen 2.5-7B (fast, cost-effective)
271
+ - Medium complexity: Qwen 2.5-32B (balanced performance)
272
+ - Complex reasoning: Qwen 2.5-72B (best quality)
273
  - Budget management: Auto-downgrade when budget exceeded
274
+ - LangGraph workflow: Optimal agent routing and synthesis
275
  ```
276
 
277
  ### 10. Monitoring and Maintenance
278
 
279
  **Key Metrics to Monitor**:
280
 
281
+ - GAIA benchmark success rate (target: 30%+)
282
  - Average response time per question
283
  - Cost per question processed
284
+ - LangGraph workflow success rate
285
+ - Qwen model availability and performance
286
 
287
  **Regular Maintenance**:
288
 
289
+ - Monitor HuggingFace Inference API status
290
  - Update dependencies for security
291
+ - Review and optimize LangGraph workflow performance
292
  - Check Unit 4 API compatibility
293
+ - Monitor Qwen model performance and costs
294
 
295
+ ## 🎯 Success Metrics
296
 
297
+ ### Expected Production Results 🚀
298
 
299
+ With proper deployment and authentication:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  - **GAIA Benchmark**: 30%+ success rate
302
+ - **LangGraph Workflow**: Multi-agent orchestration working
303
+ - **Qwen Model Performance**: Intelligent tier selection (7B→32B→72B)
304
+ - **User Experience**: Professional interface with clear authentication
305
+ - **System Reliability**: Clear success/failure modes (no degraded performance)
 
 
 
 
 
 
 
 
 
 
306
 
307
  ### Final Status:
308
+ - **Architecture**: Qwen 2.5 models + LangGraph multi-agent workflow
309
+ - **Requirements**: Clear authentication requirements (HF_TOKEN or OAuth with inference)
310
+ - **Performance**: 30%+ GAIA benchmark with full functionality
311
+ - **Reliability**: Robust error handling with clear user guidance
312
  - **Deployment**: Ready for immediate HuggingFace Space deployment
313
 
314
+ **The GAIA Agent is now a focused, high-performance system using proper AI models and multi-agent orchestration!** 🎉