Spaces:

logasanjeev
/

DocTalk

Sleeping

App Files Files Community

logasanjeev committed on Apr 20, 2025

Commit

8eb9b68

verified ·

1 Parent(s): 10965e6

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -7

app.py CHANGED Viewed

@@ -161,8 +161,8 @@ def initialize_qa_chain(llm_model, temperature):
         return "Please process documents first.", None
     try:
-        # Enable quantization for Mixtral-8x7B to reduce memory usage
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True) if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1" else None
         llm = HuggingFaceEndpoint(
             repo_id=LLM_MODELS[llm_model],
             task="text-generation",
@@ -170,7 +170,7 @@ def initialize_qa_chain(llm_model, temperature):
             max_new_tokens=512,
             huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
             timeout=30,
-            quantization_config=quantization_config
         )
         # Dynamically set k based on vector store size
         collection = vector_store._collection
@@ -186,9 +186,9 @@ def initialize_qa_chain(llm_model, temperature):
     except requests.exceptions.HTTPError as e:
         logger.error(f"HTTP error initializing QA chain for {llm_model}: {str(e)}")
         if "503" in str(e):
-            return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'Lightweight (Mistral-7B)' or wait and retry.", None
         elif "403" in str(e):
-            return f"Error: Access denied for {llm_model}. Ensure your HF token is valid.", None
         return f"Error initializing QA chain: {str(e)}.", None
     except Exception as e:
         logger.error(f"Error initializing QA chain for {llm_model}: {str(e)}")
@@ -218,9 +218,9 @@ def answer_question(question, llm_model, embedding_model, temperature, chunk_siz
     except requests.exceptions.HTTPError as e:
         logger.error(f"HTTP error answering question: {str(e)}")
         if "503" in str(e):
-            return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'Lightweight (Mistral-7B)' or wait and retry.", chat_history
         elif "403" in str(e):
-            return f"Error: Access denied for {llm_model}. Ensure your HF token is valid.", chat_history
         return f"Error answering question: {str(e)}", chat_history
     except Exception as e:
         logger.error(f"Error answering question: {str(e)}")
@@ -301,6 +301,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DocTalk: Document Q&A Chatbot") as
         inputs=[llm_model, temperature],
         outputs=[status, chat_display]
     )
     question.submit(
         fn=answer_question,
         inputs=[question, llm_model, embedding_model, temperature, chunk_size, chunk_overlap],

         return "Please process documents first.", None
     try:
+        # Enable 4-bit quantization for all models to reduce memory usage
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
         llm = HuggingFaceEndpoint(
             repo_id=LLM_MODELS[llm_model],
             task="text-generation",
             max_new_tokens=512,
             huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
             timeout=30,
+            model_kwargs={"quantization_config": quantization_config}
         )
         # Dynamically set k based on vector store size
         collection = vector_store._collection
     except requests.exceptions.HTTPError as e:
         logger.error(f"HTTP error initializing QA chain for {llm_model}: {str(e)}")
         if "503" in str(e):
+            return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'High Accuracy (Mixtral-8x7B)' or wait and retry.", None
         elif "403" in str(e):
+            return f"Error: Access denied for {llm_model}. Check your HF token permissions or upgrade to a Pro account for larger models.", None
         return f"Error initializing QA chain: {str(e)}.", None
     except Exception as e:
         logger.error(f"Error initializing QA chain for {llm_model}: {str(e)}")
     except requests.exceptions.HTTPError as e:
         logger.error(f"HTTP error answering question: {str(e)}")
         if "503" in str(e):
+            return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'High Accuracy (Mixtral-8x7B)' or wait and retry.", chat_history
         elif "403" in str(e):
+            return f"Error: Access denied for {llm_model}. Check your HF token permissions or upgrade to a Pro account for larger models.", chat_history
         return f"Error answering question: {str(e)}", chat_history
     except Exception as e:
         logger.error(f"Error answering question: {str(e)}")
         inputs=[llm_model, temperature],
         outputs=[status, chat_display]
     )
+    question里的
     question.submit(
         fn=answer_question,
         inputs=[question, llm_model, embedding_model, temperature, chunk_size, chunk_overlap],