Spaces: Running on Zero
Update app_local.py
Browse files — app_local.py (+9 −8)
app_local.py
CHANGED
|
@@ -17,14 +17,10 @@ os.environ.setdefault('GRADIO_ANALYTICS_ENABLED', 'False')
|
|
| 17 |
os.environ.setdefault('HF_HUB_DISABLE_TELEMETRY', '1')
|
| 18 |
|
| 19 |
# Model configuration
|
| 20 |
-
REWRITER_MODEL = "Qwen/Qwen1.5-
|
| 21 |
dtype = torch.bfloat16
|
| 22 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 23 |
|
| 24 |
-
# Preload enhancement model at startup
|
| 25 |
-
print("🔄 Loading prompt enhancement model...")
|
| 26 |
-
rewriter_tokenizer = AutoTokenizer.from_pretrained(REWRITER_MODEL)
|
| 27 |
-
|
| 28 |
# Quantization configuration
|
| 29 |
bnb_config = BitsAndBytesConfig(
|
| 30 |
load_in_4bit=True,
|
|
@@ -38,8 +34,12 @@ rewriter_model = AutoModelForCausalLM.from_pretrained(
|
|
| 38 |
torch_dtype=dtype,
|
| 39 |
device_map="auto",
|
| 40 |
quantization_config=bnb_config,
|
| 41 |
-
max_memory={0: "48GiB"}, # Reserve adequate memory
|
| 42 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
print("✅ Enhancement model loaded and ready!")
|
| 44 |
|
| 45 |
SYSTEM_PROMPT_EDIT = '''
|
|
@@ -129,7 +129,7 @@ def extract_json_response(model_output: str) -> str:
|
|
| 129 |
|
| 130 |
def polish_prompt(original_prompt: str) -> str:
|
| 131 |
"""Enhanced prompt rewriting using original system prompt with JSON handling"""
|
| 132 |
-
load_rewriter()
|
| 133 |
|
| 134 |
# Format as Qwen chat
|
| 135 |
messages = [
|
|
@@ -151,7 +151,8 @@ def polish_prompt(original_prompt: str) -> str:
|
|
| 151 |
max_new_tokens=256, # Reduced for better quality
|
| 152 |
do_sample=True,
|
| 153 |
temperature=0.5, # Less creative but more focused
|
| 154 |
-
top_p=0.
|
|
|
|
| 155 |
no_repeat_ngram_size=3,
|
| 156 |
pad_token_id=rewriter_tokenizer.eos_token_id
|
| 157 |
)
|
|
|
|
| 17 |
os.environ.setdefault('HF_HUB_DISABLE_TELEMETRY', '1')
|
| 18 |
|
| 19 |
# Model configuration
|
| 20 |
+
REWRITER_MODEL = "Qwen/Qwen1.5-4B-Chat" # Upgraded to 4B for better JSON handling
|
| 21 |
dtype = torch.bfloat16
|
| 22 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# Quantization configuration
|
| 25 |
bnb_config = BitsAndBytesConfig(
|
| 26 |
load_in_4bit=True,
|
|
|
|
| 34 |
torch_dtype=dtype,
|
| 35 |
device_map="auto",
|
| 36 |
quantization_config=bnb_config,
|
|
|
|
| 37 |
)
|
| 38 |
+
|
| 39 |
+
# Preload enhancement model at startup
|
| 40 |
+
print("🔄 Loading prompt enhancement model...")
|
| 41 |
+
rewriter_tokenizer = AutoTokenizer.from_pretrained(REWRITER_MODEL)
|
| 42 |
+
|
| 43 |
print("✅ Enhancement model loaded and ready!")
|
| 44 |
|
| 45 |
SYSTEM_PROMPT_EDIT = '''
|
|
|
|
| 129 |
|
| 130 |
def polish_prompt(original_prompt: str) -> str:
|
| 131 |
"""Enhanced prompt rewriting using original system prompt with JSON handling"""
|
| 132 |
+
# load_rewriter()
|
| 133 |
|
| 134 |
# Format as Qwen chat
|
| 135 |
messages = [
|
|
|
|
| 151 |
max_new_tokens=256, # Reduced for better quality
|
| 152 |
do_sample=True,
|
| 153 |
temperature=0.5, # Less creative but more focused
|
| 154 |
+
top_p=0.8,
|
| 155 |
+
repetition_penalty= 1.1,
|
| 156 |
no_repeat_ngram_size=3,
|
| 157 |
pad_token_id=rewriter_tokenizer.eos_token_id
|
| 158 |
)
|