Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -161,8 +161,8 @@ def initialize_qa_chain(llm_model, temperature):
|
|
| 161 |
return "Please process documents first.", None
|
| 162 |
|
| 163 |
try:
|
| 164 |
-
# Enable quantization for
|
| 165 |
-
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
| 166 |
llm = HuggingFaceEndpoint(
|
| 167 |
repo_id=LLM_MODELS[llm_model],
|
| 168 |
task="text-generation",
|
|
@@ -170,7 +170,7 @@ def initialize_qa_chain(llm_model, temperature):
|
|
| 170 |
max_new_tokens=512,
|
| 171 |
huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
|
| 172 |
timeout=30,
|
| 173 |
-
|
| 174 |
)
|
| 175 |
# Dynamically set k based on vector store size
|
| 176 |
collection = vector_store._collection
|
|
@@ -186,9 +186,9 @@ def initialize_qa_chain(llm_model, temperature):
|
|
| 186 |
except requests.exceptions.HTTPError as e:
|
| 187 |
logger.error(f"HTTP error initializing QA chain for {llm_model}: {str(e)}")
|
| 188 |
if "503" in str(e):
|
| 189 |
-
return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try '
|
| 190 |
elif "403" in str(e):
|
| 191 |
-
return f"Error: Access denied for {llm_model}.
|
| 192 |
return f"Error initializing QA chain: {str(e)}.", None
|
| 193 |
except Exception as e:
|
| 194 |
logger.error(f"Error initializing QA chain for {llm_model}: {str(e)}")
|
|
@@ -218,9 +218,9 @@ def answer_question(question, llm_model, embedding_model, temperature, chunk_siz
|
|
| 218 |
except requests.exceptions.HTTPError as e:
|
| 219 |
logger.error(f"HTTP error answering question: {str(e)}")
|
| 220 |
if "503" in str(e):
|
| 221 |
-
return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try '
|
| 222 |
elif "403" in str(e):
|
| 223 |
-
return f"Error: Access denied for {llm_model}.
|
| 224 |
return f"Error answering question: {str(e)}", chat_history
|
| 225 |
except Exception as e:
|
| 226 |
logger.error(f"Error answering question: {str(e)}")
|
|
@@ -301,6 +301,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DocTalk: Document Q&A Chatbot") as
|
|
| 301 |
inputs=[llm_model, temperature],
|
| 302 |
outputs=[status, chat_display]
|
| 303 |
)
|
|
|
|
| 304 |
question.submit(
|
| 305 |
fn=answer_question,
|
| 306 |
inputs=[question, llm_model, embedding_model, temperature, chunk_size, chunk_overlap],
|
|
|
|
| 161 |
return "Please process documents first.", None
|
| 162 |
|
| 163 |
try:
|
| 164 |
+
# Enable 4-bit quantization for all models to reduce memory usage
|
| 165 |
+
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
| 166 |
llm = HuggingFaceEndpoint(
|
| 167 |
repo_id=LLM_MODELS[llm_model],
|
| 168 |
task="text-generation",
|
|
|
|
| 170 |
max_new_tokens=512,
|
| 171 |
huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
|
| 172 |
timeout=30,
|
| 173 |
+
model_kwargs={"quantization_config": quantization_config}
|
| 174 |
)
|
| 175 |
# Dynamically set k based on vector store size
|
| 176 |
collection = vector_store._collection
|
|
|
|
| 186 |
except requests.exceptions.HTTPError as e:
|
| 187 |
logger.error(f"HTTP error initializing QA chain for {llm_model}: {str(e)}")
|
| 188 |
if "503" in str(e):
|
| 189 |
+
return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'High Accuracy (Mixtral-8x7B)' or wait and retry.", None
|
| 190 |
elif "403" in str(e):
|
| 191 |
+
return f"Error: Access denied for {llm_model}. Check your HF token permissions or upgrade to a Pro account for larger models.", None
|
| 192 |
return f"Error initializing QA chain: {str(e)}.", None
|
| 193 |
except Exception as e:
|
| 194 |
logger.error(f"Error initializing QA chain for {llm_model}: {str(e)}")
|
|
|
|
| 218 |
except requests.exceptions.HTTPError as e:
|
| 219 |
logger.error(f"HTTP error answering question: {str(e)}")
|
| 220 |
if "503" in str(e):
|
| 221 |
+
return f"Error: Hugging Face API temporarily unavailable for {llm_model}. Try 'High Accuracy (Mixtral-8x7B)' or wait and retry.", chat_history
|
| 222 |
elif "403" in str(e):
|
| 223 |
+
return f"Error: Access denied for {llm_model}. Check your HF token permissions or upgrade to a Pro account for larger models.", chat_history
|
| 224 |
return f"Error answering question: {str(e)}", chat_history
|
| 225 |
except Exception as e:
|
| 226 |
logger.error(f"Error answering question: {str(e)}")
|
|
|
|
| 301 |
inputs=[llm_model, temperature],
|
| 302 |
outputs=[status, chat_display]
|
| 303 |
)
|
| 304 |
+
# Run answer_question when the user submits the question box
|
| 305 |
question.submit(
|
| 306 |
fn=answer_question,
|
| 307 |
inputs=[question, llm_model, embedding_model, temperature, chunk_size, chunk_overlap],
|