st192011 committed on
Commit
f30d567
·
verified ·
1 Parent(s): 2eee276

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -10
app.py CHANGED
@@ -4,7 +4,7 @@ import json
4
  import os
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from peft import PeftModel
7
- from huggingface_hub import InferenceClient
8
 
9
  # ==============================================================================
10
  # 1. CONFIGURATION
@@ -13,19 +13,19 @@ HF_TOKEN = os.getenv("HF_TOKEN")
13
  PROJECT_TITLE = "The Janus Interface: Semantic Decoupling Architecture"
14
 
15
  # Models
16
- BASE_MODEL_ID = "microsoft/Phi-3.5-mini-instruct" # Official repo for better CPU compatibility
17
  ADAPTER_ID = "st192011/janus-gold-lora"
18
  CLOUD_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
19
 
20
  # ==============================================================================
21
- # 2. ENGINE INITIALIZATION (CPU Optimized)
22
  # ==============================================================================
23
  print("⏳ Initializing Neural Backbone...")
24
 
25
  try:
26
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
27
 
28
- # CRITICAL FIX: attn_implementation="eager" prevents the DynamicCache error on CPU
29
  base_model = AutoModelForCausalLM.from_pretrained(
30
  BASE_MODEL_ID,
31
  torch_dtype=torch.bfloat16,
@@ -34,8 +34,27 @@ try:
34
  attn_implementation="eager"
35
  )
36
 
37
- print(f"⏳ Mounting Janus Adapter ({ADAPTER_ID})...")
38
- model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  model.eval()
40
  print("✅ System Online.")
41
 
@@ -54,7 +73,6 @@ def clean_output(text):
54
  clean = text.replace("<|end|>", "").replace("<|endoftext|>", "")
55
  if "Output:" in clean: clean = clean.split("Output:")[-1]
56
 
57
- # Remove conversational filler lines
58
  lines = clean.split('\n')
59
  valid_lines = [line for line in lines if ":" in line and "Note" not in line]
60
  return " ".join(valid_lines).strip()
@@ -82,7 +100,7 @@ RAW NOTE:
82
  max_new_tokens=256,
83
  temperature=0.1,
84
  do_sample=True,
85
- use_cache=True # Enabled for speed
86
  )
87
 
88
  text = tokenizer.batch_decode(outputs)[0]
@@ -136,11 +154,11 @@ PRIVATE_DB:
136
  with torch.no_grad():
137
  outputs = model.generate(
138
  **inputs,
139
- max_new_tokens=600, # Reduced slightly to ensure completion on CPU
140
  temperature=0.1,
141
  repetition_penalty=1.05,
142
  do_sample=True,
143
- use_cache=True # Enabled for speed
144
  )
145
 
146
  text = tokenizer.batch_decode(outputs)[0]
@@ -228,6 +246,7 @@ To achieve high fidelity without using private patient data, we developed a **Te
228
  * **Base Model:** Microsoft Phi-3.5-mini-instruct (3.8B Parameters).
229
  * **Framework:** **Unsloth** (Optimized QLoRA).
230
  * **Technique:** **DoRA (Weight-Decomposed Low-Rank Adaptation)**.
 
231
  * **Loss Masking:** We used `train_on_responses_only`. The model was **never** trained on the input text, only on the output. This prevents the model from memorizing patient PII from the training set.
232
  * **Hyperparameters:** Rank 16, Alpha 16, Learning Rate 2e-4, 2 Epochs (306 samples).
233
 
 
4
  import os
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from peft import PeftModel
7
+ from huggingface_hub import InferenceClient, snapshot_download
8
 
9
  # ==============================================================================
10
  # 1. CONFIGURATION
 
13
  PROJECT_TITLE = "The Janus Interface: Semantic Decoupling Architecture"
14
 
15
  # Models
16
+ BASE_MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
17
  ADAPTER_ID = "st192011/janus-gold-lora"
18
  CLOUD_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
19
 
20
  # ==============================================================================
21
+ # 2. ENGINE INITIALIZATION (CPU Optimized + Config Sanitizer)
22
  # ==============================================================================
23
  print("⏳ Initializing Neural Backbone...")
24
 
25
  try:
26
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
27
 
28
+ # Load Base Model
29
  base_model = AutoModelForCausalLM.from_pretrained(
30
  BASE_MODEL_ID,
31
  torch_dtype=torch.bfloat16,
 
34
  attn_implementation="eager"
35
  )
36
 
37
+ print(f"⏳ Downloading and sanitizing adapter ({ADAPTER_ID})...")
38
+
39
+ # 1. Download the adapter files locally
40
+ local_adapter_path = snapshot_download(repo_id=ADAPTER_ID, token=HF_TOKEN)
41
+
42
+ # 2. Load the config JSON
43
+ config_path = os.path.join(local_adapter_path, "adapter_config.json")
44
+ with open(config_path, "r") as f:
45
+ config_data = json.load(f)
46
+
47
+ # 3. Remove the "alora_invocation_tokens" key, which causes the crash when the adapter config is loaded
48
+ if "alora_invocation_tokens" in config_data:
49
+ print("🧹 Cleaning incompatible Unsloth config keys...")
50
+ del config_data["alora_invocation_tokens"]
51
+
52
+ # Save the clean config back to disk
53
+ with open(config_path, "w") as f:
54
+ json.dump(config_data, f, indent=2)
55
+
56
+ # 4. Load the adapter from the local sanitized folder
57
+ model = PeftModel.from_pretrained(base_model, local_adapter_path)
58
  model.eval()
59
  print("✅ System Online.")
60
 
 
73
  clean = text.replace("<|end|>", "").replace("<|endoftext|>", "")
74
  if "Output:" in clean: clean = clean.split("Output:")[-1]
75
 
 
76
  lines = clean.split('\n')
77
  valid_lines = [line for line in lines if ":" in line and "Note" not in line]
78
  return " ".join(valid_lines).strip()
 
100
  max_new_tokens=256,
101
  temperature=0.1,
102
  do_sample=True,
103
+ use_cache=True
104
  )
105
 
106
  text = tokenizer.batch_decode(outputs)[0]
 
154
  with torch.no_grad():
155
  outputs = model.generate(
156
  **inputs,
157
+ max_new_tokens=600,
158
  temperature=0.1,
159
  repetition_penalty=1.05,
160
  do_sample=True,
161
+ use_cache=True
162
  )
163
 
164
  text = tokenizer.batch_decode(outputs)[0]
 
246
  * **Base Model:** Microsoft Phi-3.5-mini-instruct (3.8B Parameters).
247
  * **Framework:** **Unsloth** (Optimized QLoRA).
248
  * **Technique:** **DoRA (Weight-Decomposed Low-Rank Adaptation)**.
249
+ * *Why DoRA?* Standard LoRA struggles with strict syntax/coding tasks. DoRA updates both magnitude and direction vectors, allowing the model to learn the strict `JanusScript` grammar effectively.
250
  * **Loss Masking:** We used `train_on_responses_only`. The model was **never** trained on the input text, only on the output. This prevents the model from memorizing patient PII from the training set.
251
  * **Hyperparameters:** Rank 16, Alpha 16, Learning Rate 2e-4, 2 Epochs (306 samples).
252