Spaces:

hmm183
/

LLM

Runtime error

App Files Files Community

hmm183 committed on Jun 4, 2025

Commit

b44200d

verified ·

1 Parent(s): 6abeebb

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -254

app.py CHANGED Viewed

@@ -1,267 +1,104 @@
 import os
-import requests # Used for checking Ollama connection in a commented-out section, can be removed if not needed.
-# --- IMPORTANT: Set a writable cache directory for Hugging Face models ---
-# This is crucial for environments like Hugging Face Spaces where default cache locations
-# might not be writable or persistent. /tmp is usually writable.
-# HF_HOME is the preferred environment variable for Hugging Face cache.
-os.environ["HF_HOME"] = "/tmp/huggingface_cache"
-# Ensure the directory exists
-os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-# --- Flask and CORS ---
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-# --- LangChain and Hugging Face Libraries ---
-# Note: We are NOT using Ollama directly in this app.py for Hugging Face Spaces.
-# Instead, we are loading models directly via Hugging Face's transformers library.
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import torch # For checking GPU availability and model dtype
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-app = Flask(__name__)
-CORS(app) # Allow all origins (good for development/MVP on HF Spaces)
-# --- Model Configuration for Hugging Face Transformers ---
-# These models will be downloaded directly by the 'transformers' library.
-# 'google/gemma-2b-it' is chosen for its size and instruction-following capabilities.
-# 'sentence-transformers/all-MiniLM-L6-v2' is a small, efficient embedding model.
-LLM_MODEL_NAME_HF = "google/gemma-2b-it"
-EMBEDDING_MODEL_NAME_HF = "sentence-transformers/all-MiniLM-L6-v2"
-# Global variables for models
-llm_pipeline = None # Will hold the Hugging Face text-generation pipeline
-embeddings = None   # Will hold the HuggingFaceEmbeddings instance
-# --- User-specific Vector Stores Cache ---
-# This dictionary will hold Chroma instances, keyed by user_id.
-# IMPORTANT MVP LIMITATION: This is an in-memory cache.
-# - If the app restarts, all loaded user contexts are lost from memory (though
-#   Chroma data is saved to disk in `chroma_db_users`).
-# - For true concurrency and persistence, you'd load from disk on demand or use an external DB.
-user_vectorstores = {}
-def initialize_models():
-    """
-    Initialize Hugging Face models (LLM pipeline and Embeddings).
-    This function is called once when the Flask app starts.
-    """
-    global llm_pipeline, embeddings
-    print("Initializing Hugging Face models...")
-    try:
-        # Determine device for LLM: Use GPU if available, otherwise CPU
-        # On Hugging Face Spaces free tier, it's usually CPU (-1).
-        device = 0 if torch.cuda.is_available() else -1
-        print(f"Using device for LLM: {'cuda' if device == 0 else 'cpu'}")
-        # --- Initialize LLM Pipeline (google/gemma-2b-it) ---
-        print(f"Loading LLM: {LLM_MODEL_NAME_HF}...")
-        # AutoTokenizer and AutoModelForCausalLM will use HF_HOME for caching.
-        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME_HF)
-        # Use bfloat16 for GPU if available to save memory, otherwise float32 for CPU.
-        model = AutoModelForCausalLM.from_pretrained(
-            LLM_MODEL_NAME_HF,
-            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-        )
-        llm_pipeline = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=500, # Max tokens for the generated response
-            device=device,      # Use the determined device (CPU or GPU)
-            do_sample=True,     # Enable sampling for more varied responses
-            temperature=0.7,    # Control randomness (lower for more focused, higher for more creative)
-            top_p=0.9,          # Nucleus sampling
-            top_k=50,           # Top-k sampling
-            # Stop sequences for generation to prevent model from continuing beyond the answer
-            # These are crucial for chat models used in RAG.
-            eos_token_id=tokenizer.eos_token_id, # End of sequence token
-            pad_token_id=tokenizer.pad_token_id # Pad token ID
-        )
-        print("LLM Pipeline initialized successfully!")
-        # --- Initialize Hugging Face Embeddings (all-MiniLM-L6-v2) ---
-        print(f"Loading Embedding Model: {EMBEDDING_MODEL_NAME_HF}...")
-        embeddings = HuggingFaceEmbeddings(
-            model_name=EMBEDDING_MODEL_NAME_HF,
-            # Explicitly set cache_folder to ensure it uses the writable directory
-            cache_folder=os.environ["HF_HOME"],
-            # IMPORTANT: local_files_only=True means it will NOT try to download if not found.
-            # If you want it to download if not present, remove this line or set to False.
-            # For robust deployment, pre-caching and uploading the model is recommended.
-            model_kwargs={"local_files_only": False} # Set to False to allow download if not cached
-        )
-        print("Embedding Model initialized successfully!")
-    except Exception as e:
-        print(f"ERROR: An unexpected error occurred during model initialization: {e}")
-        llm_pipeline = None
-        embeddings = None
-        # Re-raise the exception to prevent the Flask app from starting if models fail to load
-        raise e
-@app.route('/load_document', methods=['POST'])
-def load_document():
-    """
-    Load a document for a specific user into their dedicated persistent vector store.
-    The text is chunked for better retrieval.
-    """
-    if not embeddings:
-        return jsonify({"error": "Embedding model not initialized. Server might be restarting or failed to load models."}), 500
-    data = request.get_json()
-    user_id = data.get("user_id") # Expecting a user_id from the client
-    text = data.get("text")
-    if not user_id:
-        return jsonify({"error": "User ID (user_id) is required to load a document."}), 400
-    if not text:
-        return jsonify({"error": "No text provided to load."}), 400
-    print(f"Loading document for user: {user_id}")
-    try:
-        # Create a unique persistence directory for each user's ChromaDB
-        # This will be within the Space's storage, which can be ephemeral on restarts.
-        persist_dir = f"{os.environ['HF_HOME']}/chroma_db_users/{user_id}/"
-        os.makedirs(persist_dir, exist_ok=True)
-        # Wrap the input text in a LangChain Document
-        base_document = Document(page_content=text, metadata={"user_id": user_id, "source": "user_upload"})
-        # Chunk the document for better retrieval performance
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,      # Max characters per chunk
-            chunk_overlap=200,    # Overlap between chunks to maintain context
-            length_function=len,
-            is_separator_regex=False,
-        )
-        chunks = text_splitter.split_documents([base_document])
-        # Create/overwrite the vector store for this specific user
-        # This will save to the user-specific directory on disk.
-        user_vectorstores[user_id] = Chroma.from_documents(
-            chunks, embedding=embeddings, persist_directory=persist_dir
-        )
-        print(f"Document loaded for user '{user_id}'. Chunks created: {len(chunks)} at {persist_dir}")
-        return jsonify({"message": f"Document loaded successfully for user '{user_id}'.", "chunks_created": len(chunks)})
-    except Exception as e:
-        print(f"Error loading document for user '{user_id}': {e}")
-        import traceback
-        traceback.print_exc() # Print full traceback for debugging
-        return jsonify({"error": f"Error loading document: {e}"}), 500
-@app.route('/query', methods=['POST'])
-def query():
-    """
-    Query the currently loaded document for a specific user to summarize or answer a question.
-    """
-    if not llm_pipeline or not embeddings:
-        return jsonify({"error": "Models not initialized. Server might be restarting or failed to load models."}), 500
-    data = request.get_json()
-    user_id = data.get("user_id")
-    query_text = data.get("query")
-    if not user_id:
-        return jsonify({"error": "User ID (user_id) is required to query."}), 400
-    if not query_text:
-        return jsonify({"error": "No query text provided."}), 400
-    print(f"Query received for user: {user_id}, Query: '{query_text}'")
-    # Retrieve the vector store for this specific user from the cache
-    current_user_vectorstore = user_vectorstores.get(user_id)
-    # If not in memory, attempt to load from disk for this user
-    if not current_user_vectorstore:
-        user_persist_dir = f"{os.environ['HF_HOME']}/chroma_db_users/{user_id}/"
-        if os.path.exists(user_persist_dir):
-            try:
-                # Load the existing vectorstore from disk
-                current_user_vectorstore = Chroma(persist_directory=user_persist_dir, embedding_function=embeddings)
-                user_vectorstores[user_id] = current_user_vectorstore # Cache it in memory for subsequent queries
-                print(f"Loaded existing vectorstore for user '{user_id}' from disk.")
-            except Exception as e:
-                print(f"Error loading vectorstore from disk for user '{user_id}': {e}")
-                return jsonify({"error": f"Failed to load document for user '{user_id}'. Please try loading it again or check server logs."}), 500
         else:
-            return jsonify({"error": f"No document loaded for user '{user_id}'. Please load a document first using /load_document."}), 400
-    try:
-        retriever = current_user_vectorstore.as_retriever()
-        # Create a prompt template geared toward Q&A based on context
-        prompt_template = ChatPromptTemplate.from_template(
-            """Answer the question based ONLY on the following context. If the answer is not available in the provided context, politely state that you cannot find the answer in the provided information.
 Context: {context}
-Question: {question}
-"""
-        )
-        # --- RAG Chain for Hugging Face Pipeline ---
-        # Get relevant context documents
-        retrieved_docs = retriever.invoke(query_text)
-        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
-        # Format the prompt using the template and retrieved context
-        formatted_prompt = prompt_template.format(context=context_text, question=query_text)
-        # Use the Hugging Face pipeline directly for text generation
-        outputs = llm_pipeline(formatted_prompt)
-        # The output from the pipeline needs to be parsed based on its structure
-        # It's usually a list of dictionaries, with 'generated_text' key.
-        generated_text = outputs[0]['generated_text']
-        # The model might repeat the prompt or parts of it, extract only the new response.
-        # This is a common challenge with text generation.
-        # A simple way is to find the query in the generated text and take what comes after.
-        response_start_index = generated_text.find(formatted_prompt)
-        if response_start_index != -1:
-            response = generated_text[response_start_index + len(formatted_prompt):].strip()
-        else:
-            response = generated_text.strip() # Fallback if prompt isn't found perfectly
-        # Further clean-up to remove any trailing prompt parts the model might generate
-        if response.startswith("Summary:"):
-            response = response[len("Summary:"):].strip()
-        if response.startswith("Answer:"):
-            response = response[len("Answer:"):].strip()
-        if response.startswith("Question:"):
-            response = response[len("Question:"):].strip()
-        if response.startswith("Context:"):
-            response = response[len("Context:"):].strip()
-        print(f"Response generated for user '{user_id}'.")
-        return jsonify({"response": response})
-    except Exception as e:
-        print(f"ERROR: An unexpected error occurred during query for user '{user_id}': {e}")
-        import traceback
-        traceback.print_exc()
-        return jsonify({"error": f"Error processing query: {e}"}), 500
 if __name__ == "__main__":
-    # Call initialization function directly (no Flask debug)
-    initialize_models()
-    print(f"Starting Flask RAG MVP application on http://0.0.0.0:7860 (Hugging Face Spaces default port)")
-    print(f"Using LLM: {LLM_MODEL_NAME_HF}, Embeddings: {EMBEDDING_MODEL_NAME_HF}")
-    print("API endpoints:")
-    print(" - POST /load_document (Requires 'user_id' and 'text')")
-    print(" - POST /query (Requires 'user_id' and 'query')")
-    # Hugging Face Spaces typically runs on port 7860
-    app.run(host="0.0.0.0", port=7860)

+# app.py
+import asyncio
 import os
+from typing import Dict
+import torch
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
+from langchain_core.prompts import ChatPromptTemplate
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoConfig, AutoModelForSeq2SeqLM
+# Set HF cache path.
+# NOTE(review): `transformers` has already been imported by the time this
+# assignment runs; confirm the library still honours the variable, or pass
+# `cache_dir` explicitly to `from_pretrained`.
+os.environ["TRANSFORMERS_CACHE"] = "./hf_cache"
+app = FastAPI()
+# Open CORS policy for browser clients on any origin. Credentialed requests
+# are disabled: FastAPI's CORS documentation states that wildcard origins,
+# methods and headers cannot be combined with allow_credentials=True, and
+# this API does not rely on cookies or other browser credentials.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# -----------------------------
+# Model configuration and shared state
+# -----------------------------
+# Hugging Face model ids for the text generator and the embedding model.
+LLM_MODEL_NAME = "google/flan-t5-small"  # Lightweight and fast on CPU
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# Assigned by load_models() in the startup hook; None until it has run.
+llm_model = None
+llm_tokenizer = None
+embeddings = None
+# In-memory cache of Chroma stores keyed by user_id. Filled by /load_document
+# and, on a cache miss, by /query from ./chroma_db_users/<user_id>/.
+user_vectorstores: Dict[str, Chroma] = {}
+class LoadDocRequest(BaseModel):
+    """JSON body accepted by POST /load_document."""
+    # Caller identifier; used as the vector-store cache key and as the name of
+    # the per-user persistence directory.
+    user_id: str
+    # Raw document text to be chunked and indexed.
+    text: str
+class QueryRequest(BaseModel):
+    """JSON body accepted by POST /query."""
+    # Caller identifier; selects which user's vector store is searched.
+    user_id: str
+    # Question to answer from that user's indexed document.
+    query: str
+@app.on_event("startup")
+async def load_models():
+    """Load the tokenizer, generator model and embedding model at startup.
+
+    Fills the module-level ``llm_tokenizer``, ``llm_model`` and ``embeddings``.
+    The model class is picked from the checkpoint's config so that both
+    encoder-decoder checkpoints (such as the configured FLAN-T5) and
+    decoder-only checkpoints load. Exceptions are not caught here.
+    """
+    global llm_model, llm_tokenizer, embeddings
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # TRANSFORMERS_CACHE is assigned after `transformers` was imported, so hand
+    # the directory to from_pretrained explicitly; None means library default.
+    cache_dir = os.environ.get("TRANSFORMERS_CACHE")
+    config = AutoConfig.from_pretrained(LLM_MODEL_NAME, cache_dir=cache_dir)
+    # AutoModelForCausalLM has no mapping for T5 configs and raises on them;
+    # encoder-decoder checkpoints need the seq2seq auto class instead.
+    model_cls = AutoModelForSeq2SeqLM if config.is_encoder_decoder else AutoModelForCausalLM
+    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, cache_dir=cache_dir)
+    llm_model = model_cls.from_pretrained(LLM_MODEL_NAME, cache_dir=cache_dir).to(device)
+    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
+@app.post("/load_document")
+async def load_document(data: LoadDocRequest):
+    """Chunk a user's text and index it in that user's persistent Chroma store.
+
+    Args:
+        data: Request body carrying ``user_id`` and the document ``text``.
+
+    Returns:
+        ``{"message": ...}`` with the number of chunks on success, or a 400
+        JSON response ``{"error": ...}`` when ``user_id`` is not usable as a
+        directory name or ``text`` is blank.
+    """
+    user_id = data.user_id
+    text = data.text
+    # user_id becomes a directory name below; refuse anything that could point
+    # outside ./chroma_db_users/ (path separators, dot entries, NUL).
+    if not user_id or user_id in {".", ".."} or any(ch in user_id for ch in ("/", "\\", "\x00")):
+        return JSONResponse(status_code=400, content={"error": "Invalid user_id."})
+    # Blank text produces no chunks, leaving nothing to embed or store.
+    if not text.strip():
+        return JSONResponse(status_code=400, content={"error": "No text provided to load."})
+    persist_dir = f"./chroma_db_users/{user_id}/"
+    os.makedirs(persist_dir, exist_ok=True)
+    base_document = Document(page_content=text, metadata={"source": "upload"})
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = splitter.split_documents([base_document])
+    # NOTE(review): if persist_dir already holds a collection this presumably
+    # adds to it rather than replacing it -- confirm whether reloads should reset.
+    vectorstore = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=persist_dir)
+    user_vectorstores[user_id] = vectorstore
+    return {"message": f"Loaded {len(chunks)} chunks for user {user_id}"}
+@app.post("/query")
+async def query(data: QueryRequest):
+    """Answer a question using chunks retrieved from the user's vector store.
+
+    The store is taken from the in-memory cache, or reopened from
+    ``./chroma_db_users/<user_id>/`` when it exists on disk.
+
+    Args:
+        data: Request body carrying ``user_id`` and the ``query`` text.
+
+    Returns:
+        ``{"response": ...}`` with the generated answer; a 400 JSON response
+        ``{"error": ...}`` for an unusable ``user_id``; a 404 JSON response
+        ``{"error": ...}`` when the user has no vector store.
+    """
+    user_id = data.user_id
+    query_text = data.query
+    # user_id is interpolated into a filesystem path; reject separators, dot
+    # entries and NUL so the lookup cannot leave ./chroma_db_users/.
+    if not user_id or user_id in {".", ".."} or any(ch in user_id for ch in ("/", "\\", "\x00")):
+        return JSONResponse(status_code=400, content={"error": "Invalid user_id."})
+    if user_id not in user_vectorstores:
+        persist_dir = f"./chroma_db_users/{user_id}/"
+        if os.path.exists(persist_dir):
+            user_vectorstores[user_id] = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
         else:
+            # Same body shape as before, but no longer reported as HTTP 200.
+            return JSONResponse(status_code=404, content={"error": f"No vectorstore found for user {user_id}"})
+    vectorstore = user_vectorstores[user_id]
+    retriever = vectorstore.as_retriever()
+    docs = retriever.invoke(query_text)
+    context = "\n\n".join(doc.page_content for doc in docs)
+    prompt_template = ChatPromptTemplate.from_template(
+        """Answer the question based ONLY on the context below:
 Context: {context}
+Question: {question}"""
+    )
+    prompt = prompt_template.format(context=context, question=query_text)
+    input_ids = llm_tokenizer(prompt, return_tensors="pt").input_ids.to(llm_model.device)
+    output_ids = llm_model.generate(input_ids, max_new_tokens=200)
+    generated = output_ids[0]
+    # Decoder-only models return prompt + continuation; drop the prompt by
+    # token count, since the decoded text need not match `prompt` exactly.
+    # Encoder-decoder models return only the answer tokens.
+    if not llm_model.config.is_encoder_decoder:
+        generated = generated[input_ids.shape[1]:]
+    response = llm_tokenizer.decode(generated, skip_special_tokens=True)
+    return {"response": response.strip()}
 if __name__ == "__main__":
+    # Pass the app object directly: this file is app.py, so the import string
+    # "fastapi_app:app" cannot be resolved. Auto-reload needs an import string
+    # and is a development feature, so it is not enabled here.
+    uvicorn.run(app, host="0.0.0.0", port=7860)