Spaces:

PD03
/

ZeroGPU_LLAMA

Sleeping

App Files Files Community

PD03 committed on Jul 24, 2025

Commit

393aa68

verified ·

1 Parent(s): fa59ff8

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -108

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
-# Setup Hugging Face Transformers for LLAMA3
 import os
 import requests
 import json
 import gradio as gr
 from typing import List, Dict, Any, Optional
 import logging
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import torch
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -14,100 +13,111 @@ logger = logging.getLogger(__name__)
 # Configuration - Set these as environment variables in Hugging Face Spaces
 SAP_API_KEY = os.getenv('SAP_API_KEY')  # Set in Space secrets
-HF_TOKEN = os.getenv('HF_API_TOKEN')  # Set in Space secrets for private models
 SAP_BASE_URL = "https://sandbox.api.sap.com/s4hanacloud/sap/opu/odata/sap"
-# Initialize LLAMA3 model
-MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"  # or "meta-llama/Meta-Llama-3-70B-Instruct" for larger model
 class LLAMA3Client:
-    def __init__(self):
         try:
-            # Initialize tokenizer and model
-            logger.info("Loading LLAMA3 model...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_NAME,
-                token=HF_TOKEN,
-                trust_remote_code=True
-            )
-            # Use GPU if available
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Using device: {device}")
-            self.model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                token=HF_TOKEN,
-                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-                device_map="auto" if device == "cuda" else None,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-            # Create text generation pipeline
-            self.generator = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-                device_map="auto" if device == "cuda" else None
-            )
-            logger.info("LLAMA3 model loaded successfully")
         except Exception as e:
-            logger.error(f"Error loading LLAMA3 model: {e}")
-            # Fallback to smaller model or API-based approach
             try:
-                self.generator = pipeline(
-                    "text-generation",
-                    model="microsoft/DialoGPT-medium",
-                    tokenizer="microsoft/DialoGPT-medium"
                 )
-                logger.info("Fallback model loaded")
-            except:
-                self.generator = None
-                logger.error("Failed to load any model")
-    def generate_response(self, prompt: str, max_length: int = 1000, temperature: float = 0.1) -> str:
-        """Generate response using LLAMA3"""
-        if not self.generator:
-            return "Model not available. Please check configuration."
-        try:
-            # Format prompt for LLAMA3 instruction format
-            formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are a helpful SAP data analyst. Provide clear, concise answers based on the provided data.<|eot_id|><|start_header_id|>user<|end_header_id|>
 {prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 """
-            # Generate response
-            outputs = self.generator(
-                formatted_prompt,
-                max_length=max_length,
-                temperature=temperature,
-                do_sample=True,
-                top_p=0.9,
-                num_return_sequences=1,
-                pad_token_id=self.tokenizer.eos_token_id,
-                eos_token_id=self.tokenizer.eos_token_id
             )
-            # Extract generated text
-            generated_text = outputs[0]['generated_text']
-            # Extract only the assistant's response
-            if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
-                response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
-                response = response.replace("<|eot_id|>", "").strip()
             else:
-                response = generated_text[len(formatted_prompt):].strip()
-            return response if response else "I couldn't generate a proper response. Please try rephrasing your question."
         except Exception as e:
             logger.error(f"Error generating response: {e}")
             return f"I encountered an error while processing your question: {str(e)}"
@@ -137,7 +147,7 @@ class SAPDataFetcher:
             logger.error(f"JSON decode error: {e}")
             return None
-    def fetch_sales_orders(self, top: int = 50) -> List[Dict]:
         """Fetch sales orders with error handling"""
         url = f"{SAP_BASE_URL}/API_SALES_ORDER_SRV/A_SalesOrder?$top={top}&$inlinecount=allpages"
         data = self._make_request(url)
@@ -163,7 +173,7 @@ class SAPDataFetcher:
             logger.error("Failed to fetch sales orders or invalid response format")
             return []
-    def fetch_purchase_orders(self, top: int = 50) -> List[Dict]:
         """Fetch purchase order headers"""
         url = f"{SAP_BASE_URL}/API_PURCHASEORDER_PROCESS_SRV/A_PurchaseOrder?$top={top}&$inlinecount=allpages"
         data = self._make_request(url)
@@ -195,7 +205,7 @@ class SAPDataFetcher:
         """Fetch purchase order items for given order numbers"""
         all_items = []
-        for po_number in purchase_orders[:10]:  # Limit to first 10 to avoid timeout
             url = f"{SAP_BASE_URL}/API_PURCHASEORDER_PROCESS_SRV/A_PurchaseOrderItem?$filter=PurchaseOrder eq '{po_number}'"
             data = self._make_request(url)
@@ -281,7 +291,7 @@ Answer only "yes" or "no" """
             # Check if item details are needed
             if self.needs_item_details(question) and po_headers:
                 logger.info("Fetching item-level details")
-                po_numbers = [po["PurchaseOrder"] for po in po_headers if po["PurchaseOrder"]]
                 po_items = self.data_fetcher.fetch_purchase_order_items(po_numbers)
                 context["items"] = po_items
                 data_type = "Purchase Orders with Item Details"
@@ -302,10 +312,10 @@ Answer only "yes" or "no" """
     def generate_response(self, question: str, context: Dict, data_type: str) -> str:
         """Generate response using LLAMA3"""
-        # Limit context size to prevent token overflow
         context_str = json.dumps(context, indent=2)
-        if len(context_str) > 4000:  # Smaller limit for LLAMA3
-            context_str = context_str[:4000] + "... (truncated)"
         prompt = f"""Data Type: {data_type}
@@ -319,24 +329,28 @@ Instructions:
 2. Include specific numbers, dates, or values when relevant
 3. If the data doesn't contain enough information to answer fully, mention this
 4. Format your response in a user-friendly way
-5. If there are multiple records, summarize key insights"""
         try:
-            return self.llama_client.generate_response(prompt, max_length=800, temperature=0.1)
         except Exception as e:
             logger.error(f"Error generating response: {e}")
             return f"I encountered an error while processing your question: {str(e)}"
 # Initialize the system
 try:
-    llama_client = LLAMA3Client()
-    if SAP_API_KEY:
-        data_fetcher = SAPDataFetcher(SAP_API_KEY)
-        sap_agent = SAPAgent(data_fetcher, llama_client)
-        logger.info("SAP Agent initialized successfully")
-    else:
-        logger.warning("SAP_API_KEY not found. Demo mode enabled.")
         sap_agent = None
 except Exception as e:
     logger.error(f"Failed to initialize SAP Agent: {e}")
     sap_agent = None
@@ -345,21 +359,30 @@ except Exception as e:
 def chat_with_sap(message, history):
     """Handle chat interactions"""
     if not sap_agent:
-        return history + [("System", "SAP Agent not initialized. Please check your API key configuration in Space secrets.")]
     if not message.strip():
         return history
     try:
-        response = sap_agent.process_query(message)
         history = history or []
-        history.append((message, response))
-        return history
     except Exception as e:
         error_msg = f"Error processing your request: {str(e)}"
         history = history or []
-        history.append((message, error_msg))
-        return history
 def clear_chat():
     return []
@@ -367,20 +390,24 @@ def clear_chat():
 # Create Gradio interface
 with gr.Blocks(title="SAP Order Analytics Agent with LLAMA3") as demo:
     gr.Markdown("""
-    # 🚀 SAP Order Analytics Agent (Powered by LLAMA3)
-    This AI agent uses Meta's LLAMA3 model to help you analyze SAP Sales and Purchase Orders. Ask questions like:
     - "How many sales orders do we have?"
     - "What's the total value of all purchase orders?"
-    - "Show me recent purchase orders from supplier X"
-    - "What are the top materials by quantity?"
-    **Note:** Make sure to set your `SAP_API_KEY` and `HF_TOKEN` in the Space secrets.
     """)
     chatbot = gr.Chatbot(
         height=500,
-        placeholder="Ask me anything about your SAP orders..."
     )
     with gr.Row():
@@ -403,4 +430,4 @@ with gr.Blocks(title="SAP Order Analytics Agent with LLAMA3") as demo:
 # Launch the interface
 if __name__ == "__main__":
-    demo.launch()

+# Setup Hugging Face Inference API for LLAMA3
 import os
 import requests
 import json
 import gradio as gr
 from typing import List, Dict, Any, Optional
 import logging
+import time
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 # Configuration - Set these as environment variables in Hugging Face Spaces
 SAP_API_KEY = os.getenv('SAP_API_KEY')  # Set in Space secrets
+HF_TOKEN = os.getenv('HF_TOKEN')  # Set in Space secrets
 SAP_BASE_URL = "https://sandbox.api.sap.com/s4hanacloud/sap/opu/odata/sap"
+# Hugging Face Inference API endpoints
+HF_API_BASE = "https://api-inference.huggingface.co/models"
+LLAMA3_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"  # Using inference API
 class LLAMA3Client:
     """Client for LLAMA3 text generation through the Hugging Face Inference API.

     Requests are HTTP POSTs to ``{HF_API_BASE}/{LLAMA3_MODEL}`` with the token
     sent as a bearer credential. The public methods report failures as
     human-readable strings instead of raising.
     """

     def __init__(self, hf_token: str):
         """Store the request settings and issue one warm-up request.

         Args:
             hf_token: Hugging Face access token used in the Authorization header.
         """
         self.hf_token = hf_token
         self.api_url = f"{HF_API_BASE}/{LLAMA3_MODEL}"
         self.headers = {
             "Authorization": f"Bearer {hf_token}",
             "Content-Type": "application/json",
         }
         # Warm up the model
         self._warm_up_model()

     def _warm_up_model(self):
         """Send a tiny request so a later real request is less likely to hit a cold model.

         Best effort: a failed warm-up is logged as a warning and never raised.
         """
         try:
             logger.info("Warming up LLAMA3 model...")
             succeeded, detail = self._run_inference("Hello", 10, 0.1, 3)
             # Failures come back as a flag rather than an exception, so the
             # flag must be checked before claiming success.
             if succeeded:
                 logger.info("Model warmed up successfully")
             else:
                 logger.warning(f"Model warm-up failed: {detail}")
         except Exception as e:
             logger.warning(f"Model warm-up failed: {e}")

     def _run_inference(self, prompt: str, max_new_tokens: int, temperature: float, max_retries: int) -> tuple:
         """POST the prompt with retries and report whether text was obtained.

         Args:
             prompt: Fully formatted prompt sent as ``inputs``.
             max_new_tokens: Generation length limit passed to the API.
             temperature: Sampling temperature passed to the API.
             max_retries: Maximum number of POST attempts.

         Returns:
             ``(True, generated_text)`` on success, otherwise
             ``(False, user_facing_error_message)``.
         """
         payload = {
             "inputs": prompt,
             "parameters": {
                 "max_new_tokens": max_new_tokens,
                 "temperature": temperature,
                 "do_sample": True,
                 "top_p": 0.9,
                 "return_full_text": False,
             },
         }

         for attempt in range(max_retries):
             is_last_attempt = attempt == max_retries - 1
             try:
                 response = requests.post(
                     self.api_url,
                     headers=self.headers,
                     json=payload,
                     timeout=60,
                 )

                 if response.status_code == 503:
                     # Model is loading
                     if is_last_attempt:
                         # No attempt is left, so waiting would only delay the
                         # failure message by up to a minute.
                         logger.error(f"Model still loading after {max_retries} attempts")
                         break
                     wait_time = min(20 * (attempt + 1), 60)
                     logger.info(f"Model loading, waiting {wait_time}s...")
                     time.sleep(wait_time)
                     continue

                 response.raise_for_status()
                 result = response.json()

                 if isinstance(result, list) and len(result) > 0:
                     return True, result[0].get('generated_text', '').strip()
                 if isinstance(result, dict) and 'generated_text' in result:
                     return True, result['generated_text'].strip()

                 logger.error(f"Unexpected response format: {result}")
                 return False, "I received an unexpected response format."

             except requests.exceptions.RequestException as e:
                 logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                 if is_last_attempt:
                     return False, f"Failed to get response after {max_retries} attempts: {str(e)}"
                 time.sleep(2 ** attempt)  # Exponential backoff
             except Exception as e:
                 logger.error(f"Unexpected error: {e}")
                 return False, f"An unexpected error occurred: {str(e)}"

         return False, "Failed to generate response"

     def _make_inference_request(self, prompt: str, max_new_tokens: int = 500, temperature: float = 0.1, max_retries: int = 3) -> str:
         """Make inference request to Hugging Face API with retry logic.

         Returns:
             The generated text, or an error message string when the request
             could not be completed. This method does not raise for request
             failures.
         """
         _, text = self._run_inference(prompt, max_new_tokens, temperature, max_retries)
         return text

     def generate_response(self, prompt: str, max_length: int = 500, temperature: float = 0.1) -> str:
         """Generate response using LLAMA3 via Inference API.

         Args:
             prompt: User-level prompt; it is wrapped in the LLAMA3 instruction
                 markers together with a fixed system message.
             max_length: Requested limit on new tokens; capped at 400.
             temperature: Sampling temperature forwarded to the API.

         Returns:
             The model's reply, a fallback sentence when the reply is empty, or
             an error message string.
         """
         # Format prompt for LLAMA3 instruction format.
         # NOTE(review): the reference Llama 3 chat template has a blank line
         # after each <|end_header_id|>; confirm against the real app.py whether
         # blank lines belong here before changing this string.
         formatted_prompt = (
             "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
             "You are a helpful SAP data analyst. Provide clear, concise answers based on the provided data. "
             "Keep responses under 300 words.<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
             f"{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
         )

         try:
             response = self._make_inference_request(
                 formatted_prompt,
                 max_new_tokens=min(max_length, 400),  # Limit tokens to avoid timeouts
                 temperature=temperature,
             )

             if response and response.strip():
                 return response
             return "I couldn't generate a proper response. Please try rephrasing your question."

         except Exception as e:
             logger.error(f"Error generating response: {e}")
             return f"I encountered an error while processing your question: {str(e)}"
             logger.error(f"JSON decode error: {e}")
             return None
+    def fetch_sales_orders(self, top: int = 30) -> List[Dict]:
         """Fetch sales orders with error handling"""
         url = f"{SAP_BASE_URL}/API_SALES_ORDER_SRV/A_SalesOrder?$top={top}&$inlinecount=allpages"
         data = self._make_request(url)
             logger.error("Failed to fetch sales orders or invalid response format")
             return []
+    def fetch_purchase_orders(self, top: int = 30) -> List[Dict]:
         """Fetch purchase order headers"""
         url = f"{SAP_BASE_URL}/API_PURCHASEORDER_PROCESS_SRV/A_PurchaseOrder?$top={top}&$inlinecount=allpages"
         data = self._make_request(url)
         """Fetch purchase order items for given order numbers"""
         all_items = []
+        for po_number in purchase_orders[:5]:  # Reduced limit for faster processing
             url = f"{SAP_BASE_URL}/API_PURCHASEORDER_PROCESS_SRV/A_PurchaseOrderItem?$filter=PurchaseOrder eq '{po_number}'"
             data = self._make_request(url)
             # Check if item details are needed
             if self.needs_item_details(question) and po_headers:
                 logger.info("Fetching item-level details")
+                po_numbers = [po["PurchaseOrder"] for po in po_headers[:5] if po["PurchaseOrder"]]  # Limit for performance
                 po_items = self.data_fetcher.fetch_purchase_order_items(po_numbers)
                 context["items"] = po_items
                 data_type = "Purchase Orders with Item Details"
     def generate_response(self, question: str, context: Dict, data_type: str) -> str:
         """Generate response using LLAMA3"""
+        # Limit context size for API efficiency
         context_str = json.dumps(context, indent=2)
+        if len(context_str) > 2000:  # Smaller limit for API
+            context_str = context_str[:2000] + "... (truncated)"
         prompt = f"""Data Type: {data_type}
 2. Include specific numbers, dates, or values when relevant
 3. If the data doesn't contain enough information to answer fully, mention this
 4. Format your response in a user-friendly way
+5. Keep response under 250 words"""
         try:
+            return self.llama_client.generate_response(prompt, max_length=400, temperature=0.1)
         except Exception as e:
             logger.error(f"Error generating response: {e}")
             return f"I encountered an error while processing your question: {str(e)}"
 # Initialize the system.
 # sap_agent stays None unless both secrets are present and every constructor
 # succeeds; chat_with_sap treats a falsy sap_agent as "not initialized".
 sap_agent = None
 try:
     if HF_TOKEN:
         # The LLAMA3 client is built before the SAP key is checked.
         llama_client = LLAMA3Client(HF_TOKEN)
         if not SAP_API_KEY:
             logger.warning("SAP_API_KEY not found. Demo mode enabled.")
         else:
             data_fetcher = SAPDataFetcher(SAP_API_KEY)
             sap_agent = SAPAgent(data_fetcher, llama_client)
             logger.info("SAP Agent initialized successfully")
     else:
         logger.error("HF_TOKEN not found in environment variables")
 except Exception as e:
     logger.error(f"Failed to initialize SAP Agent: {e}")
     sap_agent = None
 def chat_with_sap(message, history):
     """Handle chat interactions, streaming each chat state to the caller.

     This function is a generator: every state the chat should display is
     yielded as the full history list of ``(user, bot)`` tuples.

     Args:
         message: Text entered by the user; may be empty, blank, or None.
         history: Existing list of ``(user, bot)`` tuples, or None.

     Yields:
         The history list to display. For a real query this is first the
         history with a "Thinking..." placeholder, then the history with the
         placeholder replaced by the answer or by an error message.
     """
     history = history or []

     # A generator's ``return <value>`` never reaches the consumer, so the
     # early exits below must yield their result before returning.
     if not sap_agent:
         yield history + [("System", "SAP Agent not initialized. Please check your HF_TOKEN and SAP_API_KEY in Space secrets.")]
         return

     if not message or not message.strip():
         yield history
         return

     try:
         # Add typing indicator
         history.append((message, "🤔 Thinking..."))
         yield history

         # Process the query
         response = sap_agent.process_query(message)
         history[-1] = (message, response)
         yield history

     except Exception as e:
         error_msg = f"Error processing your request: {str(e)}"
         # Replace the placeholder if it was already appended; otherwise add a
         # new entry so the error is still shown.
         if history and history[-1][1] == "🤔 Thinking...":
             history[-1] = (message, error_msg)
         else:
             history.append((message, error_msg))
         yield history
 def clear_chat():
     """Return a new empty list, i.e. a chat history with no messages."""
     empty_history = []
     return empty_history
 # Create Gradio interface
 with gr.Blocks(title="SAP Order Analytics Agent with LLAMA3") as demo:
     gr.Markdown("""
+    # 🚀 SAP Order Analytics Agent (Powered by LLAMA3 via Inference API)
+    This AI agent uses Meta's LLAMA3 model via Hugging Face Inference API to analyze SAP data. Ask questions like:
     - "How many sales orders do we have?"
     - "What's the total value of all purchase orders?"
+    - "Show me recent purchase orders"
+    - "What are the top suppliers?"
+    **Setup Required:**
+    1. Set `HF_TOKEN` in Space secrets (your Hugging Face token)
+    2. Set `SAP_API_KEY` in Space secrets (your SAP API key)
+    3. Ensure you have access to LLAMA3 model on Hugging Face
     """)
     chatbot = gr.Chatbot(
         height=500,
+        placeholder="Ask me anything about your SAP orders...",
+        show_copy_button=True
     )
     with gr.Row():
 # Launch the interface
 if __name__ == "__main__":
     # Share links are not supported when the app runs on Hugging Face Spaces,
     # where the Space itself is the public URL. Request one only when running
     # elsewhere. SPACE_ID is an environment variable Spaces sets for the app.
     demo.launch(share=os.getenv("SPACE_ID") is None)