Spaces:

VicGerardoPR
/

BudtenderGuide

Sleeping

VicGerardoPR committed on May 17, 2025

Commit

f3d805b

verified ·

1 Parent(s): 2cf8349

Update utils/interpret_lab_pdf.py

Files changed (1) hide show

utils/interpret_lab_pdf.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import fitz  # PyMuPDF
 from transformers import pipeline
 generator = pipeline("text-generation", model="tiiuae/falcon-rw-1b")
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
@@ -9,18 +11,27 @@ def extract_text_from_pdf(pdf_path):
     for page in doc:
         text += page.get_text()
     return text
 def analyze_pdf(pdf_path):
     text = extract_text_from_pdf(pdf_path)
-    if len(text) > 3000:
-        text = text[:3000]  # Limita el texto a 3000 caracteres (~1000 tokens)
-    prompt = (
         "Eres un experto en cannabis medicinal. Analiza el siguiente certificado de análisis "
         "y brinda una interpretación clara sobre los efectos, usos potenciales y características del strain basado en los terpenos y cannabinoides. "
-        "No enfatices pesticidas ni contaminantes.\\n\\n"
-        + text +
-        "\\n\\nInterpretación:"
     )
-    result = generator(prompt, max_new_tokens=300, do_sample=True)
     return result[0]['generated_text'].split("Interpretación:")[-1].strip()

 import fitz  # PyMuPDF
 from transformers import pipeline
+from transformers import AutoTokenizer
 generator = pipeline("text-generation", model="tiiuae/falcon-rw-1b")
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     for page in doc:
         text += page.get_text()
     return text
 def analyze_pdf(pdf_path):
     """Generate a plain-language interpretation of a lab certificate PDF.

     The PDF text is extracted, prefixed with a fixed instruction prompt,
     truncated to a token budget, and passed to the module-level
     text-generation pipeline. Only the text produced after the
     "Interpretación:" marker is returned.

     Args:
         pdf_path: Path of the PDF file, forwarded to
             ``extract_text_from_pdf``.

     Returns:
         The generated interpretation with surrounding whitespace stripped.
     """
     text = extract_text_from_pdf(pdf_path)

     # Fixed instruction prompt.
     prefix = (
         "Eres un experto en cannabis medicinal. Analiza el siguiente certificado de análisis "
         "y brinda una interpretación clara sobre los efectos, usos potenciales y características del strain basado en los terpenos y cannabinoides. "
         "No enfatices pesticidas ni contaminantes.\n\n"
     )
     # Answer cue. It must be the last thing in the prompt: the return
     # statement splits on it, and without it the whole prompt would be
     # returned together with the completion.
     suffix = "\n\nInterpretación:"

     # Keep the complete prompt within the original 1024-token budget by
     # reserving room for the cue before truncating the document text.
     max_prompt_tokens = 1024
     suffix_tokens = len(tokenizer(suffix)["input_ids"])
     tokens = tokenizer(
         prefix + text,
         truncation=True,
         max_length=max_prompt_tokens - suffix_tokens,
         return_tensors="pt",
     )

     # Decode the truncated tokens back to text, then append the cue after
     # truncation so a long certificate can never cut it off.
     truncated_input = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
     prompt = truncated_input + suffix

     # Run the model on the length-safe prompt.
     result = generator(prompt, max_new_tokens=300, do_sample=True)
     return result[0]['generated_text'].split("Interpretación:")[-1].strip()