Spaces:
Sleeping
Sleeping
File size: 1,400 Bytes
0f1d294 40c25a8 0f1d294 f3d805b 0f1d294 40c25a8 0f1d294 40c25a8 0f1d294 40c25a8 f3d805b 40c25a8 01c821e 0f1d294 f3d805b 0f1d294 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import fitz # PyMuPDF
from transformers import pipeline, AutoTokenizer
# Text-generation pipeline backed by the 1B-parameter Falcon-RW model.
# NOTE(review): instantiated at module import time — weights are downloaded
# on first run, so importing this module is slow and requires network access.
generator = pipeline("text-generation", model="tiiuae/falcon-rw-1b")
# Matching tokenizer for the same checkpoint; used below to truncate
# prompts to the model's context window before generation.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")
def extract_relevant_text(pdf_path):
    """Extract lines mentioning terpene/cannabinoid data from a PDF.

    Args:
        pdf_path: Path to the lab-analysis PDF to scan.

    Returns:
        A string containing every line (newline-terminated) that mentions a
        section header ("Terpenes"/"Cannabinoids") or a known analyte name /
        measurement unit. Empty string when nothing matches.
    """
    # Any of these substrings marks a line as analytically relevant.
    # The original code split this into two redundant branches that both
    # appended the line; merged into one membership test.
    markers = (
        "Terpenes", "Cannabinoids", "mg/g", "%", "THC", "CBD",
        "Myrcene", "Limonene", "Caryophyllene", "Humulene",
        "Linalool", "Pinene", "Ocimene",
    )
    relevant = []
    # Context manager closes the document handle — the original leaked it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text().splitlines():
                if any(marker in line for marker in markers):
                    relevant.append(line)
    # join instead of repeated += (avoids quadratic string building);
    # each line keeps its trailing "\n" to match the original output exactly.
    return "".join(line + "\n" for line in relevant)
def analyze_pdf(pdf_path):
    """Generate a Spanish-language interpretation of a strain's lab report.

    Args:
        pdf_path: Path to the lab-analysis PDF.

    Returns:
        The model-generated interpretation text (prompt stripped), as a
        stripped string.
    """
    text = extract_relevant_text(pdf_path)
    prefix = (
        "Eres un experto en cannabis medicinal. Analiza los siguientes datos del análisis de una cepa de cannabis. "
        "Describe sus efectos, usos terapéuticos y el perfil del strain según sus niveles de terpenos y cannabinoides.\n\n"
    )
    # Bug fix: the return statement splits on "Interpretación:", but that
    # marker never appeared in the original prompt, so the split was a no-op
    # and the returned text still contained the entire echoed prompt.
    # Appending the marker makes the model continue after it, so the split
    # isolates just the generated answer.
    full_input = prefix + text + "\nInterpretación:"
    # Truncate at the model's 1024-token context limit before generating.
    # NOTE(review): if the PDF text is long enough to trigger truncation,
    # the trailing marker is cut off and the split degrades to the old
    # whole-text behavior — acceptable fallback, never worse than before.
    tokens = tokenizer(full_input, truncation=True, max_length=1024, return_tensors="pt")
    truncated_input = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
    result = generator(truncated_input, max_new_tokens=300, do_sample=True)
    # pipeline output echoes the prompt; keep only what follows the marker.
    return result[0]['generated_text'].split("Interpretación:")[-1].strip()
|