Spaces:
Sleeping
Sleeping
| from gradio import Interface, File, Dropdown, Textbox, Slider | |
| import json | |
| from gliner import GLiNER | |
| from doctr.io import DocumentFile | |
| from doctr.models import ocr_predictor | |
class DoctrHandler:
    """Run OCR on a PDF or image file and return its text as one string."""

    def __init__(self):
        # Build the predictor once; it is reused for every extract_text() call.
        self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)

    @staticmethod
    def _is_pdf(file_path):
        """Return True when *file_path* has a ``.pdf`` extension, ignoring case."""
        # str() lets path-like objects through; lower() makes "SCAN.PDF" count.
        return str(file_path).lower().endswith(".pdf")

    @staticmethod
    def _join_words(result):
        """Flatten an OCR result (pages > blocks > lines > words) into text.

        Words are taken in iteration order and separated by single spaces.
        """
        words = []
        for page in result.pages:
            for block in page.blocks:
                for line in block.lines:
                    words.extend(word.value for word in line.words)
        return " ".join(words).strip()

    def extract_text(self, file_path):
        """Return the text recognised in the document at *file_path*.

        Files whose name ends in ``.pdf`` (any letter case) are loaded as
        PDFs; everything else is handed to the image loader.

        Raises:
            RuntimeError: if loading or OCR fails. The original exception is
                attached as ``__cause__``.
        """
        try:
            if self._is_pdf(file_path):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)
            result = self.model(doc)
            return self._join_words(result)
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Error during OCR processing: {e}") from e
class GlinerHandler:
    """Thin wrapper around the pretrained GLiNER entity-recognition model."""

    def __init__(self):
        # Stored on the instance and forwarded to from_pretrained() below.
        self.max_length = 384
        self.model = GLiNER.from_pretrained(
            "urchade/gliner_multi-v2.1",
            max_length=self.max_length,
        )

    def predict_entities(self, text, labels, threshold):
        """Return the model's predictions for *labels* in *text*.

        *threshold* is forwarded to the model as a keyword argument; the
        model's return value is passed back unchanged.
        """
        return self.model.predict_entities(text, labels, threshold=threshold)
# Module-level handler instances: each model is constructed once, at import
# time, and then used by process_file().
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()

# Entity labels offered in the UI dropdown.
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]
def _build_response(message, searched_for, entities):
    """Serialise one result payload to the JSON string returned to the UI."""
    return json.dumps(
        {
            "message": message,
            "hits": len(entities),
            "searched_for": searched_for,
            "entities": entities,
        },
        indent=4,
        # Keep non-ASCII entity text readable instead of \uXXXX escapes.
        ensure_ascii=False,
    )


def _collect_labels(selected_entities, custom_entities):
    """Merge dropdown and comma-separated labels into one de-duplicated list."""
    if isinstance(selected_entities, str):
        selected = [selected_entities]
    else:
        # None (nothing selected) is treated the same as an empty selection.
        selected = list(selected_entities or [])
    custom = [part.strip() for part in (custom_entities or "").split(",")]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(label for label in selected + custom if label))


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR an uploaded document, run NER on the text, and return JSON.

    Args:
        uploaded_file: The upload, either a path string or an object whose
            ``name`` attribute is the path. Falsy means "no file".
        selected_entities: Labels chosen in the dropdown (list, or None).
        custom_entities: Extra labels as a comma-separated string (or None).
        threshold: Confidence threshold forwarded to the NER model.

    Returns:
        A JSON string with the keys ``message``, ``hits``, ``searched_for``
        and ``entities``. Entities are sorted by confidence, highest first.

    Exceptions raised by the OCR or NER handlers propagate to the caller.
    """
    all_entities = _collect_labels(selected_entities, custom_entities)

    # Validate on the parsed labels, so input like " , ," counts as empty.
    if not all_entities:
        return _build_response(
            "Please select or provide at least one entity to search for", [], []
        )

    if not uploaded_file:
        return _build_response("No file uploaded", [], [])

    # The upload may arrive as a plain path string or as a file-like wrapper
    # exposing the path via ``.name``; accept both.
    if isinstance(uploaded_file, str):
        file_path = uploaded_file
    else:
        file_path = uploaded_file.name

    extracted_text = ocr_handler.extract_text(file_path)

    # Nothing was recognised, so there is nothing to label; skip the model.
    if extracted_text:
        entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
    else:
        entities = []

    if not entities:
        return _build_response(
            "No entities were found in the document", all_entities, []
        )

    cleaned_entities = [
        {
            "text": entity["text"],
            "label": entity["label"],
            # float() keeps the score JSON-serialisable should the model
            # return a non-builtin numeric scalar.
            "confidence": float(entity["score"]),
        }
        for entity in entities
    ]
    cleaned_entities.sort(key=lambda item: item["confidence"], reverse=True)

    return _build_response(
        "Document destroyed successfully!", all_entities, cleaned_entities
    )
# Build the Gradio UI around process_file(): the four inputs map, in order,
# to its (uploaded_file, selected_entities, custom_entities, threshold)
# parameters, and the returned JSON string is shown in a single textbox.
iface = Interface(
    title="DocDestroyer11000",
    description=(
        "Extract valuable information from your documents in a snap! "
        "Upload your PDFs or images, select the entities you care about et started now "
        "and watch your documents be **destroyed** "
        "(or in other words - turned into JSON)! π<br>"
        "Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + "
        "https://huggingface.co/urchade/gliner_multi-v2.1"
    ),
    allow_flagging="never",
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(
            choices=DEFAULT_ENTITIES,
            label="Select Entities",
            multiselect=True,
        ),
        Textbox(
            label="Custom Entities (comma-separated)",
            placeholder="entity1, entity2, ...",
        ),
        Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.5,
            step=0.1,
            label="Confidence Threshold",
        ),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
)

# Start the app as soon as the module is executed.
iface.launch()