Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| from transformers import pipeline, AutoTokenizer | |
# Hugging Face checkpoint shared by the generation pipeline and the tokenizer,
# so both always refer to the same model.
_MODEL_ID = "tiiuae/falcon-rw-1b"

# Both objects are created once at import time and used by analyze_pdf.
generator = pipeline(task="text-generation", model=_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
def extract_relevant_text(pdf_path) -> str:
    """Return the lines of a PDF that look like terpene/cannabinoid data.

    Every page of the document is converted to plain text and scanned line by
    line. A line is kept when it contains at least one of the marker
    substrings below (section headings, units, or compound names). Matching is
    case-sensitive substring matching.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        The kept lines in document order, each terminated by ``"\\n"``.
        An empty string when no line matches.
    """
    # One flat tuple: a line is relevant if ANY of these substrings occurs.
    markers = (
        "Terpenes",
        "Cannabinoids",
        "mg/g",
        "%",
        "THC",
        "CBD",
        "Myrcene",
        "Limonene",
        "Caryophyllene",
        "Humulene",
        "Linalool",
        "Pinene",
        "Ocimene",
    )

    kept = []
    # The context manager closes the document even if text extraction fails,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            kept.extend(
                line
                for line in page.get_text().splitlines()
                if any(marker in line for marker in markers)
            )

    # Build the result in one pass instead of repeated string concatenation.
    return "".join(line + "\n" for line in kept)
def analyze_pdf(pdf_path) -> str:
    """Generate a written interpretation of a cannabis lab-report PDF.

    The relevant lines of the PDF are extracted with
    ``extract_relevant_text``, prefixed with a Spanish instruction prompt,
    truncated to at most 1024 tokens, and passed to the module-level
    text-generation pipeline.

    Args:
        pdf_path: Location of the PDF, forwarded to ``extract_relevant_text``.

    Returns:
        The text produced by the model, stripped of surrounding whitespace.
        The prompt itself is not included in the returned string.
    """
    text = extract_relevant_text(pdf_path)
    prefix = (
        "Eres un experto en cannabis medicinal. Analiza los siguientes datos del análisis de una cepa de cannabis. "
        "Describe sus efectos, usos terapéuticos y el perfil del strain según sus niveles de terpenos y cannabinoides.\n\n"
    )
    full_input = prefix + text

    # Truncate on token boundaries, then decode back to text so the pipeline
    # receives a prompt of at most 1024 tokens.
    tokens = tokenizer(full_input, truncation=True, max_length=1024, return_tensors="pt")
    truncated_input = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)

    # return_full_text=False: by default the pipeline echoes the prompt in
    # 'generated_text'. The prompt contains no "Interpretación:" marker, so
    # the split below would not remove it and callers would get the
    # instructions and raw data lines back along with the answer.
    result = generator(
        truncated_input,
        max_new_tokens=300,
        do_sample=True,
        return_full_text=False,
    )
    generated = result[0]['generated_text']

    # If the model emits its own "Interpretación:" heading, keep only what
    # follows the last occurrence; otherwise the whole continuation is kept.
    return generated.split("Interpretación:")[-1].strip()