Spaces:
Sleeping
Sleeping
| import exifread | |
| import PyPDF2 | |
| from docx import Document | |
| import io | |
| import logging | |
| import pytesseract | |
| import cv2 | |
| import numpy as np | |
| logger = logging.getLogger("deep_file_extractor") | |
def extract_image_metadata(image_bytes):
    """Collect EXIF tags and OCR text from an encoded image.

    Args:
        image_bytes: The raw, still-encoded image file contents.

    Returns:
        A dict with two keys: ``"metadata"`` maps EXIF tag names to their
        stringified values (thumbnail, filename and maker-note tags are
        left out), and ``"text"`` holds the stripped OCR output, or ``""``
        when the image could not be decoded or OCR failed. If anything
        outside the OCR step raises, the error is logged and both fields
        come back empty.
    """
    # Tags that are dropped from the returned metadata.
    skipped_tags = ('JPEGThumbnail', 'TIFFThumbnail', 'Filename', 'EXIF MakerNote')
    try:
        # Step 1: read the EXIF tags and stringify the ones we keep.
        exif_tags = exifread.process_file(io.BytesIO(image_bytes))
        metadata = {
            name: str(raw)
            for name, raw in exif_tags.items()
            if name not in skipped_tags
        }

        # Step 2: OCR. A failure here is only a warning; the metadata
        # gathered above is still returned.
        ocr_text = ""
        try:
            encoded = np.frombuffer(image_bytes, np.uint8)
            pixels = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
            # imdecode yields None when the bytes are not a decodable image.
            if pixels is not None:
                # Tesseract is fed a grayscale version of the image.
                grayscale = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
                ocr_text = pytesseract.image_to_string(grayscale).strip()
                if ocr_text:
                    logger.info("[God-Tier AI] OCR successfully extracted text from image.")
        except Exception as ocr_error:
            logger.warning(f"OCR failed on image: {ocr_error}")

        return {"metadata": metadata, "text": ocr_text}
    except Exception as error:
        logger.error(f"Error extracting image data: {error}")
        return {"metadata": {}, "text": ""}
def extract_pdf_data(pdf_bytes):
    """Pull the document-info entries and the plain text out of a PDF.

    Args:
        pdf_bytes: The raw contents of the PDF file.

    Returns:
        A dict with ``"metadata"`` (every document-info entry stringified,
        plus a ``"pages"`` entry holding the page count) and ``"text"``
        (the text of all pages that produced any, newline-separated and
        stripped). If reading the PDF raises, the error is logged and both
        fields come back empty.
    """
    try:
        pdf = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

        info = pdf.metadata
        # A missing or empty info dictionary simply contributes no entries.
        metadata = {key: str(value) for key, value in info.items()} if info else {}
        metadata["pages"] = len(pdf.pages)

        # Pages with no extractable text are skipped rather than
        # contributing blank lines.
        page_texts = []
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                page_texts.append(content)

        return {"metadata": metadata, "text": "\n".join(page_texts).strip()}
    except Exception as error:
        logger.error(f"Error extracting PDF data: {error}")
        return {"metadata": {}, "text": ""}
def extract_docx_data(docx_bytes):
    """Read the paragraph text of a Word (.docx) document.

    Args:
        docx_bytes: The raw contents of the .docx file.

    Returns:
        A dict whose ``"text"`` is the document's paragraphs joined with
        newlines and stripped, and whose ``"metadata"`` is
        ``{"format": "docx"}``. If the document cannot be read, the error
        is logged and both fields come back empty.
    """
    try:
        document = Document(io.BytesIO(docx_bytes))
        lines = [para.text for para in document.paragraphs]
        body = "\n".join(lines).strip()
    except Exception as error:
        logger.error(f"Error extracting DOCX data: {error}")
        return {"metadata": {}, "text": ""}
    return {"metadata": {"format": "docx"}, "text": body}
def process_file_data(file_bytes, content_type):
    """Route a file to the matching deep extractor based on its content type.

    Args:
        file_bytes: Raw contents of the file, handed unchanged to the
            selected extractor.
        content_type: MIME type string such as ``"image/png"`` or
            ``"application/pdf"``. It is matched by substring and
            case-insensitively. ``None`` or any non-string value is
            treated as an unknown type instead of raising.

    Returns:
        The selected extractor's result, or
        ``{"metadata": {}, "text": ""}`` when no extractor matches.
    """
    # A missing or malformed content type must not crash the router; the
    # `in` checks below would raise TypeError on None or bytes.
    if not isinstance(content_type, str):
        return {"metadata": {}, "text": ""}

    # MIME type names are case-insensitive, so normalise before matching.
    normalized = content_type.lower()

    if "image" in normalized:
        return extract_image_metadata(file_bytes)
    if "pdf" in normalized:
        return extract_pdf_data(file_bytes)
    if "document" in normalized or "docx" in normalized:
        return extract_docx_data(file_bytes)
    return {"metadata": {}, "text": ""}