# HyperMega-C2-API / metadata.py
# Wrzzzrzr's picture
# Upload folder using huggingface_hub
# 9cb2420 verified
import exifread
import PyPDF2
from docx import Document
import io
import logging
import pytesseract
import cv2
import numpy as np
# Module-level logger used by every extractor in this file; records are
# emitted under the fixed name "deep_file_extractor".
logger = logging.getLogger("deep_file_extractor")
def extract_image_metadata(image_bytes):
    """Extract EXIF metadata and OCR text from encoded image bytes.

    The two passes are independent of each other: an error while reading
    EXIF tags does not stop the OCR pass from running, and an OCR error does
    not discard the EXIF tags already collected.

    Args:
        image_bytes: Contents of an image file as a bytes-like object.

    Returns:
        A dict with two keys:
            "metadata": EXIF tag name -> ``str(value)`` for every tag except
                JPEGThumbnail, TIFFThumbnail, Filename and EXIF MakerNote.
                Empty when no tags were found or EXIF parsing failed.
            "text": OCR output with surrounding whitespace stripped. Empty
                when the image could not be decoded or OCR failed.

    Extraction errors are logged rather than raised.
    """
    # Tags excluded from the returned metadata.
    skipped_tags = {'JPEGThumbnail', 'TIFFThumbnail', 'Filename', 'EXIF MakerNote'}

    # Pass 1: Extract Metadata
    metadata = {}
    try:
        tags = exifread.process_file(io.BytesIO(image_bytes))
        # Built in one expression so a failure part-way through leaves
        # ``metadata`` empty instead of half-filled.
        metadata = {
            tag: str(value)
            for tag, value in tags.items()
            if tag not in skipped_tags
        }
    except Exception as e:
        # Best effort: report the problem and still attempt OCR below.
        logger.error(f"Error extracting image data: {e}")

    # Pass 2: Extract Text via OCR
    text = ""
    try:
        np_arr = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        # imdecode signals an undecodable buffer by returning None.
        if img is not None:
            # Preprocess for better OCR (grayscale)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            text = pytesseract.image_to_string(gray).strip()
            if text:
                logger.info("[God-Tier AI] OCR successfully extracted text from image.")
    except Exception as ocr_e:
        logger.warning(f"OCR failed on image: {ocr_e}")

    return {"metadata": metadata, "text": text}
def extract_pdf_data(pdf_bytes):
    """Pull document metadata and page text out of PDF bytes.

    Args:
        pdf_bytes: Contents of a PDF file as bytes.

    Returns:
        A dict with two keys:
            "metadata": every entry of the reader's metadata with its value
                converted via ``str``, plus "pages" holding the integer
                page count.
            "text": the text of each page that yielded any, each followed
                by a newline, concatenated and stripped.
        If anything raises, the error is logged and
        ``{"metadata": {}, "text": ""}`` is returned instead.
    """
    try:
        pdf = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

        info = pdf.metadata
        metadata = {name: str(val) for name, val in info.items()} if info else {}
        metadata["pages"] = len(pdf.pages)

        # Collect per-page text; pages with no extractable text are skipped.
        chunks = []
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text + "\n")

        return {"metadata": metadata, "text": "".join(chunks).strip()}
    except Exception as exc:
        logger.error(f"Error extracting PDF data: {exc}")
        return {"metadata": {}, "text": ""}
def extract_docx_data(docx_bytes):
    """Read the paragraph text of a Word document supplied as bytes.

    Args:
        docx_bytes: Contents of a .docx file as bytes.

    Returns:
        ``{"metadata": {"format": "docx"}, "text": <text>}`` where the text
        is every paragraph's text joined with newlines and stripped. If
        anything raises, the error is logged and
        ``{"metadata": {}, "text": ""}`` is returned instead.
    """
    try:
        document = Document(io.BytesIO(docx_bytes))
        lines = (para.text for para in document.paragraphs)
        body = "\n".join(lines).strip()
    except Exception as exc:
        logger.error(f"Error extracting DOCX data: {exc}")
        return {"metadata": {}, "text": ""}

    return {"metadata": {"format": "docx"}, "text": body}
def process_file_data(file_bytes, content_type):
    """Route a file to the matching extractor based on its content type.

    Matching is a case-insensitive substring test on ``content_type``,
    checked in this order: "image", then "pdf", then "document" or "docx".

    Args:
        file_bytes: Raw file contents, handed unchanged to the extractor.
        content_type: Content-type string such as "image/png". May be
            ``None`` or empty when the type is unknown.

    Returns:
        The chosen extractor's result dict, or
        ``{"metadata": {}, "text": ""}`` when ``content_type`` is missing,
        is not a string, or matches none of the known kinds.
    """
    # A missing or non-string content type cannot be matched; treat it as
    # unrecognised instead of letting the ``in`` test raise TypeError.
    if not isinstance(content_type, str):
        return {"metadata": {}, "text": ""}

    # Media types are case-insensitive, so compare in lower case.
    normalized = content_type.lower()

    if "image" in normalized:
        return extract_image_metadata(file_bytes)
    if "pdf" in normalized:
        return extract_pdf_data(file_bytes)
    if "document" in normalized or "docx" in normalized:
        return extract_docx_data(file_bytes)
    return {"metadata": {}, "text": ""}