"""Extract metadata and text from uploaded images, PDFs and DOCX files.

Every extractor returns a dict of the form
``{"metadata": <dict>, "text": <str>}`` and never raises: failures are
logged and reported as empty values.
"""

import io
import logging

import cv2
import exifread
import numpy as np
import PyPDF2
import pytesseract
from docx import Document

logger = logging.getLogger("deep_file_extractor")

# EXIF tags that are left out of the returned metadata.
_SKIPPED_EXIF_TAGS = frozenset(
    ("JPEGThumbnail", "TIFFThumbnail", "Filename", "EXIF MakerNote")
)


def _empty_result():
    """Return a fresh empty result so callers can mutate it safely."""
    return {"metadata": {}, "text": ""}


def _read_exif_tags(image_bytes):
    """Return the stringified EXIF tags of *image_bytes*, or ``{}`` on failure."""
    try:
        tags = exifread.process_file(io.BytesIO(image_bytes))
        return {
            tag: str(value)
            for tag, value in tags.items()
            if tag not in _SKIPPED_EXIF_TAGS
        }
    except Exception as e:
        logger.error("Error extracting image data: %s", e)
        return {}


def _ocr_image_text(image_bytes):
    """Return the text OCR finds in *image_bytes*, or ``""`` on failure."""
    try:
        pixels = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(pixels, cv2.IMREAD_COLOR)
        if img is None:
            # The bytes could not be decoded as an image; nothing to OCR.
            return ""
        # Preprocess for better OCR (grayscale).
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray).strip()
    except Exception as ocr_e:
        logger.warning("OCR failed on image: %s", ocr_e)
        return ""
    if text:
        logger.info("[God-Tier AI] OCR successfully extracted text from image.")
    return text


def extract_image_metadata(image_bytes):
    """Extract EXIF metadata and run OCR on image bytes.

    The two passes are independent: a failure while reading EXIF tags no
    longer discards text that OCR was able to recover, and vice versa.

    Args:
        image_bytes: Raw bytes of the image file.

    Returns:
        ``{"metadata": {tag: str(value), ...}, "text": ocr_text}``. Either
        part is empty when its pass fails.
    """
    return {
        "metadata": _read_exif_tags(image_bytes),
        "text": _ocr_image_text(image_bytes),
    }


def extract_pdf_data(pdf_bytes):
    """Extract document metadata and raw text from PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        ``{"metadata": {...}, "text": text}`` where ``metadata`` holds the
        stringified document information entries plus a ``"pages"`` count,
        and ``text`` is the text of all pages that yielded any, joined with
        newlines. Both are empty if the PDF cannot be processed.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        metadata = {}
        if reader.metadata:
            metadata.update(
                (key, str(value)) for key, value in reader.metadata.items()
            )
        metadata["pages"] = len(reader.pages)

        page_texts = (page.extract_text() for page in reader.pages)
        # Pages with no extractable text are skipped.
        text = "\n".join(chunk for chunk in page_texts if chunk)
        return {"metadata": metadata, "text": text.strip()}
    except Exception as e:
        logger.error("Error extracting PDF data: %s", e)
        return _empty_result()


def extract_docx_data(docx_bytes):
    """Extract paragraph text from a Word (DOCX) document.

    Args:
        docx_bytes: Raw bytes of the DOCX file.

    Returns:
        ``{"metadata": {"format": "docx"}, "text": text}`` with the
        document's paragraphs joined by newlines, or an empty result if the
        document cannot be processed.
    """
    try:
        doc = Document(io.BytesIO(docx_bytes))
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        return {"metadata": {"format": "docx"}, "text": text.strip()}
    except Exception as e:
        logger.error("Error extracting DOCX data: %s", e)
        return _empty_result()


def process_file_data(file_bytes, content_type):
    """Route a file to the matching extractor based on its content type.

    Matching is by substring and case-insensitive, because MIME type names
    are case-insensitive. A missing or non-string content type yields the
    empty result instead of raising.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        content_type: MIME type string of the file, e.g. ``"image/png"``.

    Returns:
        The chosen extractor's result, or ``{"metadata": {}, "text": ""}``
        when no extractor matches.
    """
    if not isinstance(content_type, str) or not content_type:
        return _empty_result()

    kind = content_type.lower()
    if "image" in kind:
        return extract_image_metadata(file_bytes)
    if "pdf" in kind:
        return extract_pdf_data(file_bytes)
    if "document" in kind or "docx" in kind:
        return extract_docx_data(file_bytes)
    return _empty_result()