Spaces:

hmm183
/

LLM

Runtime error

App Files Files Community

hmm183 committed on Jun 4, 2025

Commit

f4b962e

verified ·

1 Parent(s): 71469fb

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -48

app.py CHANGED Viewed

@@ -1,16 +1,25 @@
-import os # Import os at the top
-# Set a writable cache directory for transformers
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
 from flask import Flask, request, jsonify
 from flask_cors import CORS
-# No requests import needed for Ollama connection check if not using Ollama
-# Import Hugging Face Transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import torch
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate
@@ -19,94 +28,126 @@ from langchain_core.runnables import RunnablePassthrough
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 app = Flask(__name__)
-CORS(app)
-# ... (rest of your app.py code) ...
 # --- Model Configuration for Hugging Face Transformers ---
-# CHOOSE A SMALLER MODEL! Gemma 4B is too large for free tier usually.
-# 'google/gemma-2b-it' is a good conversational starting point.
 LLM_MODEL_NAME_HF = "google/gemma-2b-it"
-EMBEDDING_MODEL_NAME_HF = "sentence-transformers/all-MiniLM-L6-v2" # Standard small embedding model
 # Global variables for models
-llm_pipeline = None # Will be a Hugging Face pipeline
-embeddings = None   # Will be a HuggingFaceEmbeddings instance
 # --- User-specific Vector Stores Cache ---
 user_vectorstores = {}
 def initialize_models():
     """
     Initialize Hugging Face models (LLM pipeline and Embeddings).
     """
     global llm_pipeline, embeddings
     print("Initializing Hugging Face models...")
     try:
         # Determine device for LLM: Use GPU if available, otherwise CPU
         device = 0 if torch.cuda.is_available() else -1
-        print(f"Using device: {'cuda' if device == 0 else 'cpu'}")
-        # Initialize LLM Pipeline
-        # This will download the model weights (gemma-2b-it is ~5GB)
-        # It's recommended to do this once at startup.
         print(f"Loading LLM: {LLM_MODEL_NAME_HF}...")
         tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME_HF)
-        model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME_HF, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32)
         llm_pipeline = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
-            max_new_tokens=500, # Limit response length
-            device=device,
-            # Add other generation parameters as needed, e.g., do_sample=True, top_p=0.9, temperature=0.7
         )
         print("LLM Pipeline initialized successfully!")
-        # Initialize Hugging Face Embeddings
         print(f"Loading Embedding Model: {EMBEDDING_MODEL_NAME_HF}...")
-        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME_HF)
         print("Embedding Model initialized successfully!")
     except Exception as e:
         print(f"ERROR: An unexpected error occurred during model initialization: {e}")
         llm_pipeline = None
         embeddings = None
-        # Raise the exception to prevent the app from starting if models fail to load
         raise e
-# --- Helper function to adapt HF pipeline to LangChain's LLM interface ---
-# LangChain's pipeline.py can convert HF pipelines but requires some setup.
-# For simplicity, we'll manually wrap it in the RAG chain
-# We will use it directly in the RAG chain's invoke step.
 @app.route('/load_document', methods=['POST'])
 def load_document():
-    # ... (rest of your /load_document function remains largely the same) ...
-    # Ensure 'embeddings' is properly loaded before this.
     if not embeddings:
         return jsonify({"error": "Embedding model not initialized. Server might be restarting or failed to load models."}), 500
     data = request.get_json()
-    user_id = data.get("user_id")
     text = data.get("text")
-    if not user_id: return jsonify({"error": "User ID (user_id) is required to load a document."}), 400
-    if not text: return jsonify({"error": "No text provided to load."}), 400
     print(f"Loading document for user: {user_id}")
     try:
         # Create a unique persistence directory for each user's ChromaDB
-        # NOTE: On Hugging Face Spaces, this persist_dir will be within the Space's storage,
-        # which can be ephemeral or reset, depending on space type/resource usage.
-        # For a true persistent solution, you'd need external storage.
-        persist_dir = f"./chroma_db_users/{user_id}/"
         os.makedirs(persist_dir, exist_ok=True)
         base_document = Document(page_content=text, metadata={"user_id": user_id, "source": "user_upload"})
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         chunks = text_splitter.split_documents([base_document])
         user_vectorstores[user_id] = Chroma.from_documents(
             chunks, embedding=embeddings, persist_directory=persist_dir
         )
@@ -115,9 +156,10 @@ def load_document():
         return jsonify({"message": f"Document loaded successfully for user '{user_id}'.", "chunks_created": len(chunks)})
     except Exception as e:
         print(f"Error loading document for user '{user_id}': {e}")
         return jsonify({"error": f"Error loading document: {e}"}), 500
 @app.route('/query', methods=['POST'])
 def query():
     """
@@ -130,18 +172,24 @@ def query():
     user_id = data.get("user_id")
     query_text = data.get("query")
-    if not user_id: return jsonify({"error": "User ID (user_id) is required to query."}), 400
-    if not query_text: return jsonify({"error": "No query text provided."}), 400
     print(f"Query received for user: {user_id}, Query: '{query_text}'")
     current_user_vectorstore = user_vectorstores.get(user_id)
     if not current_user_vectorstore:
-        user_persist_dir = f"./chroma_db_users/{user_id}/"
         if os.path.exists(user_persist_dir):
             try:
                 current_user_vectorstore = Chroma(persist_directory=user_persist_dir, embedding_function=embeddings)
-                user_vectorstores[user_id] = current_user_vectorstore
                 print(f"Loaded existing vectorstore for user '{user_id}' from disk.")
             except Exception as e:
                 print(f"Error loading vectorstore from disk for user '{user_id}': {e}")
@@ -152,6 +200,7 @@ def query():
     try:
         retriever = current_user_vectorstore.as_retriever()
         prompt_template = ChatPromptTemplate.from_template(
             """Answer the question based ONLY on the following context. If the answer is not available in the provided context, politely state that you cannot find the answer in the provided information.
@@ -170,12 +219,10 @@ Question: {question}
         formatted_prompt = prompt_template.format(context=context_text, question=query_text)
         # Use the Hugging Face pipeline directly for text generation
-        # Pass the formatted prompt to the pipeline
         outputs = llm_pipeline(formatted_prompt)
         # The output from the pipeline needs to be parsed based on its structure
         # It's usually a list of dictionaries, with 'generated_text' key.
-        # You might need to refine this parsing based on the exact model's output format.
         generated_text = outputs[0]['generated_text']
         # The model might repeat the prompt or parts of it, extract only the new response.
@@ -187,6 +234,17 @@ Question: {question}
         else:
             response = generated_text.strip() # Fallback if prompt isn't found perfectly
         print(f"Response generated for user '{user_id}'.")
         return jsonify({"response": response})
     except Exception as e:
@@ -205,4 +263,5 @@ if __name__ == "__main__":
     print(" - POST /query (Requires 'user_id' and 'query')")
     # Hugging Face Spaces typically runs on port 7860
-    app.run(host="0.0.0.0", port=7860)

+import os
+import requests # Used for checking Ollama connection in a commented-out section, can be removed if not needed.
+# --- IMPORTANT: Set a writable cache directory for Hugging Face models ---
+# This is crucial for environments like Hugging Face Spaces where default cache locations
+# might not be writable or persistent. /tmp is usually writable.
+# HF_HOME is the preferred environment variable for Hugging Face cache.
+os.environ["HF_HOME"] = "/tmp/huggingface_cache"
+# Ensure the directory exists
+os.makedirs(os.environ["HF_HOME"], exist_ok=True)
+# --- Flask and CORS ---
 from flask import Flask, request, jsonify
 from flask_cors import CORS
+# --- LangChain and Hugging Face Libraries ---
+# Note: We are NOT using Ollama directly in this app.py for Hugging Face Spaces.
+# Instead, we are loading models directly via Hugging Face's transformers library.
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch # For checking GPU availability and model dtype
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 app = Flask(__name__)
+CORS(app) # Allow all origins (good for development/MVP on HF Spaces)
 # --- Model Configuration for Hugging Face Transformers ---
+# These models will be downloaded directly by the 'transformers' library.
+# 'google/gemma-2b-it' is chosen for its size and instruction-following capabilities.
+# 'sentence-transformers/all-MiniLM-L6-v2' is a small, efficient embedding model.
 LLM_MODEL_NAME_HF = "google/gemma-2b-it"
+EMBEDDING_MODEL_NAME_HF = "sentence-transformers/all-MiniLM-L6-v2"
 # Global variables for models
+llm_pipeline = None # Will hold the Hugging Face text-generation pipeline
+embeddings = None   # Will hold the HuggingFaceEmbeddings instance
 # --- User-specific Vector Stores Cache ---
+# This dictionary holds Chroma instances, keyed by user_id.
+# IMPORTANT MVP LIMITATION: This is an in-memory cache.
+# - If the app restarts, all loaded user contexts are lost from memory (though
+#   Chroma data is saved to disk under `$HF_HOME/chroma_db_users/<user_id>/`,
+#   and /query tries to reload it from there when the user is not in this cache).
+# - For true concurrency and durable persistence, you'd use an external DB.
 user_vectorstores = {}
 def initialize_models():
     """
     Initialize Hugging Face models (LLM pipeline and Embeddings).
+    This function is called once when the Flask app starts.
     """
     global llm_pipeline, embeddings
     print("Initializing Hugging Face models...")
     try:
         # Determine device for LLM: Use GPU if available, otherwise CPU
+        # On Hugging Face Spaces free tier, it's usually CPU (-1).
         device = 0 if torch.cuda.is_available() else -1
+        print(f"Using device for LLM: {'cuda' if device == 0 else 'cpu'}")
+        # --- Initialize LLM Pipeline (google/gemma-2b-it) ---
         print(f"Loading LLM: {LLM_MODEL_NAME_HF}...")
+        # AutoTokenizer and AutoModelForCausalLM will use HF_HOME for caching.
         tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME_HF)
+        # Use bfloat16 for GPU if available to save memory, otherwise float32 for CPU.
+        model = AutoModelForCausalLM.from_pretrained(
+            LLM_MODEL_NAME_HF,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        )
         llm_pipeline = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
+            max_new_tokens=500, # Max tokens for the generated response
+            device=device,      # Use the determined device (CPU or GPU)
+            do_sample=True,     # Enable sampling for more varied responses
+            temperature=0.7,    # Control randomness (lower for more focused, higher for more creative)
+            top_p=0.9,          # Nucleus sampling
+            top_k=50,           # Top-k sampling
+            # Stop sequences for generation to prevent model from continuing beyond the answer
+            # These are crucial for chat models used in RAG.
+            eos_token_id=tokenizer.eos_token_id, # End of sequence token
+            pad_token_id=tokenizer.pad_token_id # Pad token ID
         )
         print("LLM Pipeline initialized successfully!")
+        # --- Initialize Hugging Face Embeddings (all-MiniLM-L6-v2) ---
         print(f"Loading Embedding Model: {EMBEDDING_MODEL_NAME_HF}...")
+        embeddings = HuggingFaceEmbeddings(
+            model_name=EMBEDDING_MODEL_NAME_HF,
+            # Explicitly set cache_folder to ensure it uses the writable directory
+            cache_folder=os.environ["HF_HOME"],
+            # NOTE: local_files_only is set to False below, so the model is downloaded
+            # if it is not already cached. Setting it to True would make loading fail
+            # instead of downloading when the files are not found locally.
+            # For robust deployment, pre-caching and uploading the model is recommended.
+            model_kwargs={"local_files_only": False} # Set to False to allow download if not cached
+        )
         print("Embedding Model initialized successfully!")
     except Exception as e:
         print(f"ERROR: An unexpected error occurred during model initialization: {e}")
         llm_pipeline = None
         embeddings = None
+        # Re-raise the exception to prevent the Flask app from starting if models fail to load
         raise e
 @app.route('/load_document', methods=['POST'])
 def load_document():
+    """
+    Load a document for a specific user into their dedicated persistent vector store.
+    The text is chunked for better retrieval.
+    """
     if not embeddings:
         return jsonify({"error": "Embedding model not initialized. Server might be restarting or failed to load models."}), 500
     data = request.get_json()
+    user_id = data.get("user_id") # Expecting a user_id from the client
     text = data.get("text")
+    if not user_id:
+        return jsonify({"error": "User ID (user_id) is required to load a document."}), 400
+    if not text:
+        return jsonify({"error": "No text provided to load."}), 400
     print(f"Loading document for user: {user_id}")
     try:
         # Create a unique persistence directory for each user's ChromaDB
+        # This lives under HF_HOME (/tmp/huggingface_cache), inside the Space's storage, which can be ephemeral on restarts.
+        persist_dir = f"{os.environ['HF_HOME']}/chroma_db_users/{user_id}/"
         os.makedirs(persist_dir, exist_ok=True)
+        # Wrap the input text in a LangChain Document
         base_document = Document(page_content=text, metadata={"user_id": user_id, "source": "user_upload"})
+        # Chunk the document for better retrieval performance
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,      # Max characters per chunk
+            chunk_overlap=200,    # Overlap between chunks to maintain context
+            length_function=len,
+            is_separator_regex=False,
+        )
         chunks = text_splitter.split_documents([base_document])
+        # Create/overwrite the vector store for this specific user
+        # This will save to the user-specific directory on disk.
         user_vectorstores[user_id] = Chroma.from_documents(
             chunks, embedding=embeddings, persist_directory=persist_dir
         )
         return jsonify({"message": f"Document loaded successfully for user '{user_id}'.", "chunks_created": len(chunks)})
     except Exception as e:
         print(f"Error loading document for user '{user_id}': {e}")
+        import traceback
+        traceback.print_exc() # Print full traceback for debugging
         return jsonify({"error": f"Error loading document: {e}"}), 500
 @app.route('/query', methods=['POST'])
 def query():
     """
     user_id = data.get("user_id")
     query_text = data.get("query")
+    if not user_id:
+        return jsonify({"error": "User ID (user_id) is required to query."}), 400
+    if not query_text:
+        return jsonify({"error": "No query text provided."}), 400
     print(f"Query received for user: {user_id}, Query: '{query_text}'")
+    # Retrieve the vector store for this specific user from the cache
     current_user_vectorstore = user_vectorstores.get(user_id)
+    # If not in memory, attempt to load from disk for this user
     if not current_user_vectorstore:
+        user_persist_dir = f"{os.environ['HF_HOME']}/chroma_db_users/{user_id}/"
         if os.path.exists(user_persist_dir):
             try:
+                # Load the existing vectorstore from disk
                 current_user_vectorstore = Chroma(persist_directory=user_persist_dir, embedding_function=embeddings)
+                user_vectorstores[user_id] = current_user_vectorstore # Cache it in memory for subsequent queries
                 print(f"Loaded existing vectorstore for user '{user_id}' from disk.")
             except Exception as e:
                 print(f"Error loading vectorstore from disk for user '{user_id}': {e}")
     try:
         retriever = current_user_vectorstore.as_retriever()
+        # Create a prompt template geared toward Q&A based on context
         prompt_template = ChatPromptTemplate.from_template(
             """Answer the question based ONLY on the following context. If the answer is not available in the provided context, politely state that you cannot find the answer in the provided information.
         formatted_prompt = prompt_template.format(context=context_text, question=query_text)
         # Use the Hugging Face pipeline directly for text generation
         outputs = llm_pipeline(formatted_prompt)
         # The output from the pipeline needs to be parsed based on its structure
         # It's usually a list of dictionaries, with 'generated_text' key.
         generated_text = outputs[0]['generated_text']
         # The model might repeat the prompt or parts of it, extract only the new response.
         else:
             response = generated_text.strip() # Fallback if prompt isn't found perfectly
+        # Further clean-up: strip any leading prompt label ("Summary:", "Answer:", ...) the model might echo at the start of its response
+        if response.startswith("Summary:"):
+            response = response[len("Summary:"):].strip()
+        if response.startswith("Answer:"):
+            response = response[len("Answer:"):].strip()
+        if response.startswith("Question:"):
+            response = response[len("Question:"):].strip()
+        if response.startswith("Context:"):
+            response = response[len("Context:"):].strip()
         print(f"Response generated for user '{user_id}'.")
         return jsonify({"response": response})
     except Exception as e:
     print(" - POST /query (Requires 'user_id' and 'query')")
     # Hugging Face Spaces typically runs on port 7860
+    app.run(host="0.0.0.0", port=7860)