Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,7 +16,6 @@ import chromadb
|
|
| 16 |
import tempfile
|
| 17 |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
| 18 |
import requests
|
| 19 |
-
from transformers import BitsAndBytesConfig
|
| 20 |
|
| 21 |
# Set up logging
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -161,16 +160,13 @@ def initialize_qa_chain(llm_model, temperature):
|
|
| 161 |
return "Please process documents first.", None
|
| 162 |
|
| 163 |
try:
|
| 164 |
-
# Enable 4-bit quantization for all models to reduce memory usage
|
| 165 |
-
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
| 166 |
llm = HuggingFaceEndpoint(
|
| 167 |
repo_id=LLM_MODELS[llm_model],
|
| 168 |
task="text-generation",
|
| 169 |
temperature=float(temperature),
|
| 170 |
max_new_tokens=512,
|
| 171 |
huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
|
| 172 |
-
timeout=30
|
| 173 |
-
model_kwargs={"quantization_config": quantization_config}
|
| 174 |
)
|
| 175 |
# Dynamically set k based on vector store size
|
| 176 |
collection = vector_store._collection
|
|
|
|
| 16 |
import tempfile
|
| 17 |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
| 18 |
import requests
|
|
|
|
| 19 |
|
| 20 |
# Set up logging
|
| 21 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 160 |
return "Please process documents first.", None
|
| 161 |
|
| 162 |
try:
|
|
|
|
|
|
|
| 163 |
llm = HuggingFaceEndpoint(
|
| 164 |
repo_id=LLM_MODELS[llm_model],
|
| 165 |
task="text-generation",
|
| 166 |
temperature=float(temperature),
|
| 167 |
max_new_tokens=512,
|
| 168 |
huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
|
| 169 |
+
timeout=30
|
|
|
|
| 170 |
)
|
| 171 |
# Dynamically set k based on vector store size
|
| 172 |
collection = vector_store._collection
|