Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -751,13 +751,33 @@ def init_model_if_needed():
|
|
| 751 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 752 |
if tokenizer.pad_token is None:
|
| 753 |
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
|
| 755 |
-
dtype = torch.float16 if device.type == "cuda" else torch.float32
|
| 756 |
model = AutoModelForCausalLM.from_pretrained(
|
| 757 |
MODEL_NAME,
|
| 758 |
-
|
| 759 |
-
|
|
|
|
|
|
|
| 760 |
)
|
|
|
|
|
|
|
| 761 |
model.to(device)
|
| 762 |
model.eval()
|
| 763 |
|
|
@@ -966,13 +986,23 @@ def generate_reply(user_message, history_context=""):
|
|
| 966 |
facts = dedupe_facts([extra_fact] + facts)
|
| 967 |
|
| 968 |
draft = compose_draft_from_facts(facts)
|
| 969 |
-
|
| 970 |
-
reply = polish_with_model(user_message, draft, facts, history_context)
|
| 971 |
-
if reply and not is_generic_or_placeholder_answer(reply):
|
| 972 |
-
|
| 973 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
return general_chat_reply(user_message, history_context)
|
| 975 |
-
|
| 976 |
# =========================================================
|
| 977 |
# API
|
| 978 |
# =========================================================
|
|
|
|
| 751 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 752 |
if tokenizer.pad_token is None:
|
| 753 |
tokenizer.pad_token = tokenizer.eos_token
|
| 754 |
+
#################################################################################
|
| 755 |
+
#dtype = torch.float16 if device.type == "cuda" else torch.float32
|
| 756 |
+
#model = AutoModelForCausalLM.from_pretrained(
|
| 757 |
+
# MODEL_NAME,
|
| 758 |
+
# torch_dtype=dtype,
|
| 759 |
+
# low_cpu_mem_usage=True
|
| 760 |
+
#)
|
| 761 |
+
#################################################################################
|
| 762 |
+
# 4-bit quantization so the model fits in 16 GB RAM (requires 'bitsandbytes' and 'accelerate')
|
| 763 |
+
from transformers import BitsAndBytesConfig
|
| 764 |
+
|
| 765 |
+
bnb_config = BitsAndBytesConfig(
|
| 766 |
+
load_in_4bit=True,
|
| 767 |
+
bnb_4bit_compute_dtype=torch.float16,
|
| 768 |
+
bnb_4bit_quant_type="nf4",
|
| 769 |
+
bnb_4bit_use_double_quant=True,
|
| 770 |
+
)
|
| 771 |
|
|
|
|
| 772 |
model = AutoModelForCausalLM.from_pretrained(
|
| 773 |
MODEL_NAME,
|
| 774 |
+
quantization_config=bnb_config,
|
| 775 |
+
device_map="auto", # Verteilt das Modell optimal auf GPU/CPU
|
| 776 |
+
low_cpu_mem_usage=True,
|
| 777 |
+
token=HF_TOKEN
|
| 778 |
)
|
| 779 |
+
|
| 780 |
+
#################################################################################
|
| 781 |
model.to(device)
|
| 782 |
model.eval()
|
| 783 |
|
|
|
|
| 986 |
facts = dedupe_facts([extra_fact] + facts)
|
| 987 |
|
| 988 |
draft = compose_draft_from_facts(facts)
|
| 989 |
+
######################################################################################################
|
| 990 |
+
#reply = polish_with_model(user_message, draft, facts, history_context)
|
| 991 |
+
#if reply and not is_generic_or_placeholder_answer(reply):
|
| 992 |
+
# return reply
|
| 993 |
+
#
|
| 994 |
+
# return general_chat_reply(user_message, history_context)
|
| 995 |
+
######################################################################################################
|
| 996 |
+
# Only polish when relevant facts were actually found
|
| 997 |
+
if facts and len(facts) > 0:
|
| 998 |
+
reply = polish_with_model(user_message, draft, facts, history_context)
|
| 999 |
+
# If polishing succeeded and the result is not a generic stock phrase, use it
|
| 1000 |
+
if reply and not is_generic_or_placeholder_answer(reply):
|
| 1001 |
+
return reply
|
| 1002 |
+
|
| 1003 |
+
# If there are no facts or the polished result was unusable: fall back to normal chat
|
| 1004 |
return general_chat_reply(user_message, history_context)
|
| 1005 |
+
##################################################################################################
|
| 1006 |
# =========================================================
|
| 1007 |
# API
|
| 1008 |
# =========================================================
|