# Spaces: Sleeping
# File size: 4,676 Bytes
import base64
import json
import io
import os
from PIL import Image
from typing import Dict, Any, List, Optional
try:
from transformers import pipeline
HAS_TRANSFORMERS = True
except ImportError:
HAS_TRANSFORMERS = False
print("Warning: transformers not found. Using mock extraction.")
from models import LabReportData, Cannabinoid, Terpene
# Simulation of Hugging Face pipeline for document understanding
# In reality, this would use: pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# Or a multimodal model like Donut: model="naver-clova-ix/donut-base-finetuned-docvqa"
class LabReportParser:
    """Extract potency figures and a strain name from a lab-report file.

    The pipeline is: OCR the uploaded image/PDF with pytesseract, pull the
    "Total THC" / "Total CBD" numbers out of the OCR text with regular
    expressions, and (when a Hugging Face token is available) ask the hosted
    document-QA model for the strain name. Every stage is best-effort: a
    failure is reported on stdout and the parser falls back to defaults
    (0.0 potency, strain name derived from the file name).
    """

    # Only the first pages of a PDF are rasterised and OCR'd.
    _MAX_OCR_PAGES = 2
    _API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
    # Upper bound on the remote call so a stalled request cannot hang forever.
    _API_TIMEOUT_SECONDS = 30
    # A well-formed decimal ("24", "24.5", ".5"); unlike "[\d\.]+" it can
    # never capture a lone "." or a trailing sentence period.
    _NUMBER = r"(\d+(?:\.\d+)?|\.\d+)"
    _THC_PATTERN = r"(?:Total\s*THC|THC\s*Total|Potency)[:\s]*" + _NUMBER
    _CBD_PATTERN = r"(?:Total\s*CBD|CBD\s*Total)[:\s]*" + _NUMBER

    def __init__(self, use_remote_api: bool = False, hf_token: Optional[str] = None):
        """Store the configuration.

        Args:
            use_remote_api: Stored on the instance; not consulted by the
                methods of this class (the remote lookup is gated on the
                token only).
            hf_token: Hugging Face API token. Falls back to the ``HF_TOKEN``
                environment variable when omitted or empty.
        """
        self.use_remote_api = use_remote_api
        self.hf_token = hf_token or os.getenv("HF_TOKEN")

    async def extract_data(self, file_content: str, file_name: str) -> "LabReportData":
        """Build a ``LabReportData`` from a base64-encoded image or PDF.

        Args:
            file_content: Base64-encoded bytes of the uploaded file.
            file_name: Original file name; a ``.pdf`` suffix (any case)
                selects the PDF path, and the name seeds the strain name.

        Returns:
            A ``LabReportData`` with "Total THC" and "Total CBD" cannabinoids
            (0.0 when not found), confidence 0.8 if a THC value above zero
            was found and 0.5 otherwise.

        NOTE: the OCR and HTTP calls made here are synchronous, so they block
        the event loop while they run.
        """
        # 1. Prepare images and OCR text.
        raw_text, image_b64 = self._run_ocr(file_content, file_name)

        # 2. Extract using regex (fast and reliable for numbers).
        thc_val = self._find_percentage(raw_text, self._THC_PATTERN)
        cbd_val = self._find_percentage(raw_text, self._CBD_PATTERN)

        # 3. Use the Inference API for naming; fall back to the file name.
        strain_name = self._strain_name_from_file(file_name)
        if self.hf_token and image_b64:
            strain_name = self._query_strain_name(image_b64) or strain_name

        # 4. Final data assembly.
        print(f"DEBUG: OCR Extraction - THC: {thc_val}%, Name: {strain_name}")
        return LabReportData(
            strain_name=strain_name,
            strain_type="Hybrid",  # default
            cannabinoids=[
                Cannabinoid(name="Total THC", value=thc_val, unit="%"),
                Cannabinoid(name="Total CBD", value=cbd_val, unit="%"),
            ],
            terpenes=[],
            file_name=file_name,
            confidence=0.8 if thc_val > 0 else 0.5,
            source_type="ai_hybrid",
        )

    def _run_ocr(self, file_content: str, file_name: str) -> tuple:
        """Return ``(ocr_text, jpeg_base64_or_None)`` for the uploaded file."""
        page_texts = []
        image_b64 = None
        try:
            import pytesseract

            image_data = base64.b64decode(file_content)
            if file_name.lower().endswith(".pdf"):
                from pdf2image import convert_from_bytes

                # Rasterise only the pages that are actually OCR'd.
                pages = convert_from_bytes(
                    image_data, first_page=1, last_page=self._MAX_OCR_PAGES
                )
                pil_image = pages[0] if pages else None
                for page in pages[: self._MAX_OCR_PAGES]:
                    page_texts.append(pytesseract.image_to_string(page))
            else:
                pil_image = Image.open(io.BytesIO(image_data))
                page_texts.append(pytesseract.image_to_string(pil_image))

            if pil_image is not None:
                # JPEG cannot store alpha or palette modes (e.g. RGBA PNGs);
                # without this conversion save() raises and the remote
                # strain-name lookup is silently skipped.
                if pil_image.mode not in ("RGB", "L"):
                    pil_image = pil_image.convert("RGB")
                buffered = io.BytesIO()
                pil_image.save(buffered, format="JPEG", quality=95)
                image_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        except Exception as e:
            # Deliberately broad: missing OCR dependencies, bad base64 and
            # unreadable images all degrade to "no text / no image".
            print(f"OCR/Image Pre-processing Error: {e}")
        # Text gathered before a failure is still returned.
        return "".join(page_texts), image_b64

    @staticmethod
    def _find_percentage(text: str, pattern: str) -> float:
        """Return the first number matched by ``pattern`` in ``text``, else 0.0."""
        import re

        match = re.search(pattern, text, re.I)
        return float(match.group(1)) if match else 0.0

    @staticmethod
    def _strain_name_from_file(file_name: str) -> str:
        """Return the title-cased file name with its final extension removed."""
        stem, _ext = os.path.splitext(file_name)
        return stem.title()

    def _query_strain_name(self, image_b64: str) -> Optional[str]:
        """Ask the hosted document-QA model for the strain name.

        Returns the title-cased answer, or ``None`` when the request fails or
        the answer is unusable (2 characters or fewer, or contains "unknown").
        """
        try:
            import requests
        except ImportError as exc:
            print(f"Strain name lookup skipped: {exc}")
            return None

        headers = {"Authorization": f"Bearer {self.hf_token}"}
        payload = {"inputs": {"image": image_b64, "question": "What is the strain name?"}}
        try:
            resp = requests.post(
                self._API_URL,
                headers=headers,
                json=payload,
                timeout=self._API_TIMEOUT_SECONDS,
            ).json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            print(f"Strain name lookup failed: {exc}")
            return None

        # Anything other than a non-empty list of dicts (for example an error
        # object) is treated as "no answer".
        if not (isinstance(resp, list) and resp and isinstance(resp[0], dict)):
            return None
        answer = str(resp[0].get("answer") or "").title()
        if len(answer) > 2 and "unknown" not in answer.lower():
            return answer
        return None

    def _empty_extraction(self, file_name: str) -> "LabReportData":
        """Return a zero-confidence placeholder result for ``file_name``.

        Not needed anymore as extract_data is now more robust, but kept for
        compatibility.
        """
        return LabReportData(
            strain_name=self._strain_name_from_file(file_name),
            cannabinoids=[Cannabinoid(name="Total THC", value=0.0)],
            confidence=0.0,
            file_name=file_name,
            source_type="error",
        )

    async def normalize(self, data: Dict[str, Any]) -> "LabReportData":
        """Construct a ``LabReportData`` from a dict of keyword arguments."""
        return LabReportData(**data)
|