File size: 5,947 Bytes
f0b69ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
"""
T9 Oracle GLiNER Entity Extractor - HF Space Deployment
Gradio API endpoint for zero-shot NER with 72 medical device labels
Deployed on: Persistent T4 GPU
Model: urchade/gliner_large-v2.1 (1.7GB)
Cost: $0.60/hour
"""
import json
import logging
import re
from typing import List, Dict

import gradio as gr
from gliner import GLiNER
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used for the startup and per-request messages below.
logger = logging.getLogger(__name__)
# Approved entity labels (from T9 configuration), grouped by tier.
# The tiers below add up to 72 labels in total.
_LABEL_TIERS = (
    # Tier 1: Critical Identifiers (4)
    ("part_number", "component_name", "manufacturer", "model_number"),
    # Tier 2: Specifications & Measurements (13)
    (
        "pressure", "temperature", "voltage", "current", "material",
        "dimension", "weight", "volume", "flow_rate", "power",
        "diameter", "length", "thickness",
    ),
    # Tier 3: Standards & Compliance (4)
    ("standard_reference", "certification", "compliance", "safety_class"),
    # Tier 4: Geometry & Mechanical (11)
    (
        "thread_standard", "pipe_size", "tubing_size", "connector_type",
        "surface_finish", "surface_treatment", "width", "height",
        "tolerance", "hardness", "torque",
    ),
    # Tier 5: Documentation (7)
    (
        "diagram_reference", "drawing_number", "procedure_number",
        "test_protocol", "revision", "sku_number", "part_label",
    ),
    # Tier 6: Operational Parameters (9)
    (
        "accuracy", "speed", "frequency", "resistance",
        "operating_temperature", "supply_voltage", "response_time",
        "duty_cycle", "operating_range",
    ),
    # Tier 7: Manufacturing (8)
    (
        "operator_id", "tool_number", "gauge_id", "fixture_number",
        "machine_id", "lot_number", "serial_number", "batch_id",
    ),
    # Tier 8: Medical Device (7)
    (
        "medical_device", "scope_manufacturer", "channel_type",
        "port_type", "hub_type", "color_code", "leak_test",
    ),
    # Tier 9: Visual Elements (2)
    ("diagram_type", "technical_annotation"),
    # Tier 10: Quality & Maintenance (7)
    (
        "calibration_interval", "service_interval", "mtbf",
        "warranty", "expiration_date", "production_date", "inspection_report",
    ),
)

# Flat list in tier order; this is what gets passed to the model on each prediction.
ENTITY_LABELS = [label for tier in _LABEL_TIERS for label in tier]
# Load the GLiNER model once, at import time, so every request reuses the
# same in-memory instance.
_GLINER_CHECKPOINT = "urchade/gliner_large-v2.1"
logger.info("Loading GLiNER Large model (1.7GB)...")
model = GLiNER.from_pretrained(_GLINER_CHECKPOINT)
# The labels are not bound to the model here; they are passed on each
# predict call. The count is logged for reference only.
logger.info(f"✓ GLiNER loaded with {len(ENTITY_LABELS)} labels")
def extract_entities(text: str, max_length: int = 10000) -> str:
    """
    Extract entities from text using GLiNER zero-shot NER.

    Args:
        text: Input text. Empty, whitespace-only or None input is answered
            with an error payload and never reaches the model.
        max_length: Maximum number of characters passed to the model.
            Longer input is cut to this length and a warning is logged.

    Returns:
        JSON string. On success it holds "entities" (a list of objects with
        "text", "label", "start", "end" and "score"), "input_length",
        "entity_count" and "labels_used". On failure it holds an empty
        "entities" list and an "error" message. This function does not raise
        for model errors; they are reported through the "error" field.
    """
    if not text or not text.strip():
        return json.dumps({"entities": [], "error": "Empty text provided"})

    # Truncate if too long
    if len(text) > max_length:
        logger.warning("Text truncated from %s to %s chars", len(text), max_length)
        text = text[:max_length]

    # Broad catch is deliberate: this is the API boundary, and callers expect
    # a JSON error payload rather than an exception.
    try:
        predictions = model.predict_entities(text, ENTITY_LABELS)

        # Normalise each prediction to a fixed set of keys; the score is cast
        # to a plain float before serialisation.
        entities = [
            {
                "text": pred.get("text", ""),
                "label": pred.get("label", ""),
                "start": pred.get("start", 0),
                "end": pred.get("end", 0),
                "score": float(pred.get("score", 0.0)),
            }
            for pred in predictions
        ]

        logger.info("Extracted %s entities from %s chars", len(entities), len(text))
        return json.dumps({
            "entities": entities,
            "input_length": len(text),
            "entity_count": len(entities),
            "labels_used": len(ENTITY_LABELS)
        }, indent=2)
    except Exception as e:
        # logger.exception records the traceback alongside the message, so
        # model failures can be diagnosed from the logs.
        logger.exception("Extraction failed: %s", e)
        return json.dumps({"entities": [], "error": str(e)})
def batch_extract(text_batch: str) -> str:
    """
    Extract entities from multiple texts separated by blank lines.

    Args:
        text_batch: Multiple texts separated by a blank line (double
            newline). Both LF and CRLF line endings are accepted. Empty or
            None input yields an empty result set.

    Returns:
        JSON string with "results" (one extract_entities payload per text,
        each tagged with its zero-based "text_index") and "batch_size".
    """
    # "\r?" lets CRLF blank lines ("\r\n\r\n") act as separators too; a plain
    # split on "\n\n" would leave such a batch as one single text.
    segments = re.split(r"\r?\n\r?\n", text_batch or "")
    texts = [segment for segment in map(str.strip, segments) if segment]

    results = []
    for i, text in enumerate(texts):
        # extract_entities returns a JSON string; decode it so the index can
        # be attached before re-serialising the whole batch.
        result = json.loads(extract_entities(text))
        result["text_index"] = i
        results.append(result)
    return json.dumps({"results": results, "batch_size": len(texts)}, indent=2)
# Build the Gradio interface. Only extract_entities is wired in, with a
# single text input, so max_length always takes its default here.
_input_box = gr.Textbox(
    lines=10,
    placeholder="Enter technical text here (max 10,000 chars)...",
    label="Input Text",
)
_output_view = gr.JSON(label="Extracted Entities")

# The label count is interpolated from ENTITY_LABELS so it tracks the list.
_description = f"""
**Zero-shot NER for Medical Device Technical Documentation**
Extracts **{len(ENTITY_LABELS)} entity types** across 10 tiers:
- Part numbers, dimensions, materials, standards
- Electrical specs, pressure, temperature, flow rates
- Thread standards, tolerances, surface treatments
- Medical device specific (scopes, channels, colors)
- Quality & maintenance data
**Model:** GLiNER Large v2.1 (1.7GB)
**Hardware:** NVIDIA T4 GPU (16GB VRAM)
**Max input:** 10,000 characters per request
"""

# One single-element row per example, matching the single text input.
_examples = [
    ["Part Number: A70002-2, Material: SS316L, Pressure: 60 psi, Thread: 1/4\" NPT"],
    ["Standard: ISO 1179-2, ASTM A112, Temperature: -40 to 85°C, Dimension: 6mm x 35mm"],
    ["Manufacturer: Olympus, Channel: Biopsy, Color: Orange Tubing, Serial: SN-123456"],
]

demo = gr.Interface(
    fn=extract_entities,
    inputs=[_input_box],
    outputs=_output_view,
    title="T9 Oracle Entity Extractor (GLiNER Large)",
    description=_description,
    examples=_examples,
    api_name="extract",  # Important: enables API access
    allow_flagging="never",
)
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # HF Spaces handles sharing
    }
    demo.launch(**launch_options)
|