Spaces:

anthonym21
/

ask-my-research

Running

App Files Files Community

anthonym21 committed on Jan 29

Commit

2ce4d6f

1 Parent(s): c3f6521

Switch to HF Inference API - no ZeroGPU needed

Browse files

Files changed (3) hide show

README.md +0 -1
app.py +28 -38
requirements.txt +3 -6

README.md CHANGED Viewed

@@ -9,7 +9,6 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Chat with Anthony Maio's AI safety research papers
-hardware: zero-a10g
 ---
 # Ask My Research

 pinned: false
 license: mit
 short_description: Chat with Anthony Maio's AI safety research papers
 ---
 # Ask My Research

app.py CHANGED Viewed

@@ -1,20 +1,19 @@
 """
 Ask My Research - RAG chatbot over Anthony Maio's AI safety papers.
-Runs on HuggingFace Spaces with ZeroGPU.
 """
 import json
 import time
 from pathlib import Path
 from collections import defaultdict
 import gradio as gr
 import numpy as np
-import spaces
-import torch
-import faiss
 from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForCausalLM, AutoTokenizer
 # =============================================================================
 # Configuration
@@ -102,10 +101,15 @@ else:
     faiss_index = None
     chunks = []
-print("Loading LLM tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
 # =============================================================================
 # RAG Functions
@@ -179,20 +183,11 @@ def format_citations(retrieved_chunks: list[dict]) -> str:
 # =============================================================================
-# Generation with ZeroGPU
 # =============================================================================
-@spaces.GPU(duration=120)
 def generate_response(query: str, context: str) -> str:
-    """Generate response using the LLM with ZeroGPU."""
-    # Load model on GPU
-    model = AutoModelForCausalLM.from_pretrained(
-        LLM_MODEL,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
-    )
     # Build prompt
     system_prompt = """You are a helpful research assistant that answers questions about Anthony Maio's AI safety research papers.
@@ -211,24 +206,19 @@ Question: {query}
 Provide a helpful answer based ONLY on the context above. If the context doesn't contain relevant information, say so."""
-    messages = [
-        {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
-    ]
-    # Generate
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-    with torch.no_grad():
-        outputs = model.generate(
-            inputs,
-            max_new_tokens=MAX_NEW_TOKENS,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            pad_token_id=tokenizer.pad_token_id,
-        )
-    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
     return response.strip()
@@ -263,7 +253,7 @@ def chat(message: str, history: list, request: gr.Request) -> str:
     try:
         response = generate_response(message, context)
     except Exception as e:
-        return f"Error generating response: {str(e)}"
     # Add citations
     citations = format_citations(retrieved)

 """
 Ask My Research - RAG chatbot over Anthony Maio's AI safety papers.
+Runs on HuggingFace Spaces using the Inference API.
 """
 import json
+import os
 import time
 from pathlib import Path
 from collections import defaultdict
 import gradio as gr
 import numpy as np
+from huggingface_hub import InferenceClient
 from sentence_transformers import SentenceTransformer
+import faiss
 # =============================================================================
 # Configuration
     faiss_index = None
     chunks = []
+# Initialize the Inference Client
+print("Initializing HF Inference Client...")
+hf_token = os.environ.get("HF_TOKEN")
+if hf_token:
+    client = InferenceClient(token=hf_token)
+    print("Inference client ready with authentication")
+else:
+    client = InferenceClient()
+    print("WARNING: No HF_TOKEN found - using unauthenticated requests")
 # =============================================================================
 # RAG Functions
 # =============================================================================
+# Generation with Inference API
 # =============================================================================
 def generate_response(query: str, context: str) -> str:
+    """Generate response using the HF Inference API."""
     # Build prompt
     system_prompt = """You are a helpful research assistant that answers questions about Anthony Maio's AI safety research papers.
 Provide a helpful answer based ONLY on the context above. If the context doesn't contain relevant information, say so."""
+    # Format for Mistral instruction format
+    prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
+    # Call the Inference API
+    response = client.text_generation(
+        prompt,
+        model=LLM_MODEL,
+        max_new_tokens=MAX_NEW_TOKENS,
+        temperature=0.7,
+        top_p=0.9,
+        repetition_penalty=1.1,
+    )
     return response.strip()
     try:
         response = generate_response(message, context)
     except Exception as e:
+        return f"Error generating response: {type(e).__name__}: {str(e)}"
     # Add citations
     citations = format_citations(retrieved)

requirements.txt CHANGED Viewed

@@ -1,8 +1,5 @@
-gradio>=4.44.0
-transformers>=4.40.0
-torch>=2.0.0
 sentence-transformers>=2.2.0
 faiss-cpu>=1.7.4
-PyMuPDF>=1.23.0
-accelerate>=0.27.0
-spaces>=0.28.0

+gradio>=5.0.0
+huggingface_hub>=0.20.0
 sentence-transformers>=2.2.0
 faiss-cpu>=1.7.4
+numpy