Spaces:

scigeek
/

pharmaguide

Sleeping

App Files Files Community

scigeek committed 27 days ago

Commit

84a71ec

verified ·

1 Parent(s): 617b2c8

Upload 2 files

Browse files

Fix tokenizer and lifestyle card cleaning

Files changed (2) hide show

app.py +17 -23
function_calling.py +37 -3

app.py CHANGED Viewed

@@ -102,9 +102,11 @@ except Exception as _e:
 print("Loading model...")
 try:
-    from transformers import AutoModelForCausalLM, AutoProcessor
-    processor = AutoProcessor.from_pretrained(MODEL_PATH)
     # On T4 (16GB): load in 4-bit via bitsandbytes to fit the 8B model
     # On H200/ZeroGPU (80GB): load in bfloat16 — no quantization needed
@@ -128,9 +130,10 @@ try:
 except Exception as e:
     print(f"✗ Model load failed entirely: {e}")
-    model         = None
-    processor     = None
-    _MODEL_LOADED = False
 # ── Text cleanup for FDA raw strings ─────────────────────────────────────────
@@ -161,35 +164,26 @@ def _run_inference(user_prompt: str) -> str:
         {"role": "user",   "content": user_prompt},
     ]
-    # apply_chat_template builds the native Gemma 4 ChatML format
-    prompt_text = processor.apply_chat_template(
         messages,
-        tokenize            = False,
         add_generation_prompt = True,
-    ) if hasattr(processor, "apply_chat_template") else user_prompt
-    inputs = processor(
-        text           = prompt_text,
-        return_tensors = "pt",
-        truncation     = True,
-        max_length     = MAX_SEQ_LENGTH,
     ).to(DEVICE)
     with torch.no_grad():
         output_ids = model.generate(
-            **inputs,
             max_new_tokens = MAX_NEW_TOKENS,
             temperature    = 0.7,
             top_p          = 0.9,
             do_sample      = True,
-            pad_token_id   = (
-                processor.tokenizer.eos_token_id
-                if hasattr(processor, "tokenizer")
-                else processor.eos_token_id
-            ),
         )
-    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
     return processor.decode(new_tokens, skip_special_tokens=True).strip()
@@ -279,7 +273,7 @@ def on_photo_submit(image, age: int):
         yield "", "⚠️ The model is not loaded. Cannot process the image."
         return
-    drug_name = extract_drug_name_from_image(image, model, processor, device=DEVICE)
     if not drug_name:
         yield (

 print("Loading model...")
 try:
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoProcessor
+    processor       = AutoTokenizer.from_pretrained(MODEL_PATH)   # text inference
+    vision_processor = AutoProcessor.from_pretrained(MODEL_PATH)  # image (photo tab)
     # On T4 (16GB): load in 4-bit via bitsandbytes to fit the 8B model
     # On H200/ZeroGPU (80GB): load in bfloat16 — no quantization needed
 except Exception as e:
     print(f"✗ Model load failed entirely: {e}")
+    model            = None
+    processor        = None
+    vision_processor = None
+    _MODEL_LOADED    = False
 # ── Text cleanup for FDA raw strings ─────────────────────────────────────────
         {"role": "user",   "content": user_prompt},
     ]
+    # apply_chat_template formats the messages with the model's own chat template and, with tokenize=True, returns the token IDs directly
+    input_ids = processor.apply_chat_template(
         messages,
+        tokenize              = True,
         add_generation_prompt = True,
+        return_tensors        = "pt",
     ).to(DEVICE)
     with torch.no_grad():
         output_ids = model.generate(
+            input_ids,
             max_new_tokens = MAX_NEW_TOKENS,
             temperature    = 0.7,
             top_p          = 0.9,
             do_sample      = True,
+            eos_token_id   = processor.eos_token_id,   # stop at the tokenizer's EOS token
+            pad_token_id   = processor.eos_token_id,
         )
+    new_tokens = output_ids[0][input_ids.shape[1]:]
     return processor.decode(new_tokens, skip_special_tokens=True).strip()
         yield "", "⚠️ The model is not loaded. Cannot process the image."
         return
+    drug_name = extract_drug_name_from_image(image, model, vision_processor, device=DEVICE)
     if not drug_name:
         yield (

function_calling.py CHANGED Viewed

@@ -111,7 +111,7 @@ def _extract_field(record: Optional[dict], *field_names: str) -> str:
         if val:
             raw = val[0] if isinstance(val, list) else val
             # Collapse excessive whitespace from FDA's raw text
-            return re.sub(r"\s+", " ", str(raw)).strip()[:600]
     return ""
@@ -202,6 +202,41 @@ def get_geriatric_warnings(drug_name: str) -> str:
     return ". ".join(relevant[:3]) + "." if relevant else ""
 def get_lifestyle_warnings(drug_list: list[str]) -> dict:
     """
     Extract food, alcohol, and lifestyle interaction warnings for a list of drugs.
@@ -253,8 +288,7 @@ def get_lifestyle_warnings(drug_list: list[str]) -> dict:
             for sentence in sentences:
                 sentence_lower = sentence.lower()
                 if any(kw in sentence_lower for kw in keywords):
-                    # Truncate long sentences
-                    clean = sentence[:200].strip()
                     if clean and clean not in hits:
                         hits.append(clean)
             if hits:

         if val:
             raw = val[0] if isinstance(val, list) else val
             # Collapse excessive whitespace from FDA's raw text
+            return re.sub(r"\s+", " ", str(raw)).strip()[:2000]
     return ""
     return ". ".join(relevant[:3]) + "." if relevant else ""
+_SECTION_HEADER_RE = re.compile(r"^\s*\d+(?:\.\d+)?\s+[A-Z][A-Z\s]+")   # "7 DRUG INTERACTIONS"
+_PAREN_REF_RE      = re.compile(r"\(\s*\d+(?:\.\d+)?\s*\)")              # "( 5.1 )"
+_LEADING_NUM_RE    = re.compile(r"^\s*\d+(?:\.\d+)?\s+")                 # "2 DOSAGE..."
+def _clean_lifestyle_sentence(sentence: str) -> str:
+    """
+    Strip FDA formatting artifacts from a single sentence before display.
+    Returns "" if the sentence is just a section header with no useful content.
+    """
+    # Reject pure section headers like "7 DRUG INTERACTIONS"
+    if _SECTION_HEADER_RE.match(sentence) and len(sentence.split()) <= 5:
+        return ""
+    # Strip inline section references like "( 5.1 )" or "( 2 )"
+    sentence = _PAREN_REF_RE.sub("", sentence)
+    # Strip leading section numbers like "2 DOSAGE AND ADMINISTRATION"
+    sentence = _LEADING_NUM_RE.sub("", sentence)
+    # Collapse whitespace (no truncation here: over-long sentences are rejected below)
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    # Reject if too short after cleaning or still looks like a header (all caps)
+    if len(sentence) < 20 or sentence.isupper():
+        return ""
+    # Drop sentences that are too long to be a single clean thought
+    # (likely mid-paragraph FDA text split at a bad boundary)
+    if len(sentence) > 180:
+        return ""
+    return sentence
 def get_lifestyle_warnings(drug_list: list[str]) -> dict:
     """
     Extract food, alcohol, and lifestyle interaction warnings for a list of drugs.
             for sentence in sentences:
                 sentence_lower = sentence.lower()
                 if any(kw in sentence_lower for kw in keywords):
+                    clean = _clean_lifestyle_sentence(sentence)
                     if clean and clean not in hits:
                         hits.append(clean)
             if hits: