Update app.py
app.py CHANGED

@@ -1,3 +1,4 @@
+# main.py (your code, unchanged except for the port in the CMD of the Dockerfile)
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
@@ -13,11 +14,8 @@ import os
 from huggingface_hub import login
 from peft import PeftModel, PeftConfig
 
-
-# Create FastAPI app
 app = FastAPI()
 
-# CORS middleware setup
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -26,40 +24,30 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Pydantic models
 class ChatRequest(BaseModel):
     message: str
-    history: list = []
+    history: list = []
 
 class ChatResponse(BaseModel):
     response: str
 
-# Load model and tokenizer
-from peft import PeftModel, PeftConfig
-
-
 def load_model_and_tokenizer(base_model_name="mistralai/Mistral-7B-Instruct-v0.3", adapter_name="Danaasa/bible_mistral"):
-    # Get the Hugging Face token from environment variable
     hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
-
-    # Log in with the token if available
     if hf_token:
         login(token=hf_token)
         print("Successfully logged in with Hugging Face token")
     else:
         print("No Hugging Face token found in environment variables")
 
-    # Load tokenizer with token for authentication
     tokenizer = AutoTokenizer.from_pretrained(
         base_model_name,
         trust_remote_code=True,
-        token=hf_token
+        token=hf_token
     )
 
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
-    # Set up quantization
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
@@ -67,28 +55,25 @@ def load_model_and_tokenizer(base_model_name="mistralai/Mistral-7B-Instruct-v0.3
         bnb_4bit_compute_dtype=torch.float16
     )
 
-    # Load the base model with token for authentication
     base_model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
         quantization_config=quantization_config,
         device_map="auto",
         trust_remote_code=True,
-        token=hf_token
+        token=hf_token
     )
 
-    # Load the adapter with token for authentication
     model = PeftModel.from_pretrained(
         base_model,
         adapter_name,
-        token=hf_token
+        token=hf_token
     )
 
     model.eval()
     return model, tokenizer
-
+
 model, tokenizer = load_model_and_tokenizer()
 
-# Response generator
 def generate_response(question, conversation_history, model, tokenizer):
     system_prompt = """
     - You are a truthful Christian AI assistant.
@@ -105,9 +90,7 @@ def generate_response(question, conversation_history, model, tokenizer):
 
     input_text = f"[INST] {system_prompt} [/INST]\n"
 
-    # Add conversation history if available
     if conversation_history:
-        # Use the last 3 exchanges for context (can adjust as needed)
         recent_history = conversation_history[-3:]
         input_text += "Previous context (for reference only, do not repeat):\n"
         for user_msg, assistant_msg in recent_history:
@@ -133,8 +116,6 @@ def generate_response(question, conversation_history, model, tokenizer):
 
     try:
         answer = full_response.split("[/INST]")[-1].strip()
-
-        # Clean known pieces
         if system_prompt in answer:
             answer = answer.replace(system_prompt, "").strip()
         if "Previous context" in answer:
@@ -143,13 +124,10 @@ def generate_response(question, conversation_history, model, tokenizer):
             answer = answer.split("Current question")[-1].strip()
         if question in answer[:len(question) + 10]:
             answer = answer.split(question)[-1].strip()
-
         if answer.startswith(("The assistant", "*The assistant")):
             answer = answer.split(".", 1)[-1].strip() if "." in answer else answer
-
         if answer.startswith('"') and answer.endswith('"'):
             answer = answer[1:-1].strip()
-
     except IndexError:
         print(f"Warning: Parsing failed, raw response: {full_response}")
         answer = full_response
@@ -159,22 +137,18 @@ def generate_response(question, conversation_history, model, tokenizer):
     for word in words:
         current_response += word + " "
         yield current_response.strip()
-        time.sleep(0.05)
+        time.sleep(0.05)
 
-# Stream response to client
 async def stream_response(message: str, conversation_history: List[Tuple[str, str]]):
     for response_chunk in generate_response(message, conversation_history, model, tokenizer):
-        # Send each chunk as a server-sent event
        yield f"data: {json.dumps({'text': response_chunk})}\n\n"
-        await asyncio.sleep(0.05)
+        await asyncio.sleep(0.05)
 
 @app.post("/chat")
 async def chat(request: ChatRequest):
     message = request.message
 
-    # Process conversation history safely
     try:
-        # Make sure each history item has exactly two elements (user_msg, assistant_msg)
         conversation_history = [
             (h[0], h[1]) for h in request.history
             if isinstance(h, list) and len(h) >= 2
@@ -183,7 +157,6 @@ async def chat(request: ChatRequest):
         print(f"Error processing history: {e}")
         conversation_history = []
 
-    # Return a streaming response
     return StreamingResponse(
         stream_response(message, conversation_history),
         media_type="text/event-stream"
@@ -191,7 +164,6 @@ async def chat(request: ChatRequest):
 
 @app.post("/chat-full", response_model=ChatResponse)
 async def chat_full(request: ChatRequest):
-    """Non-streaming endpoint as fallback"""
     message = request.message
 
     try:
@@ -203,7 +175,6 @@ async def chat_full(request: ChatRequest):
         print(f"Error processing history: {e}")
         conversation_history = []
 
-    # Generate complete response
     response_text = ""
     for partial in generate_response(message, conversation_history, model, tokenizer):
         response_text = partial