| import os
|
| import mimetypes
|
| import PyPDF2
|
| import docx
|
| import cv2
|
| import numpy as np
|
| from PIL import Image
|
| import pytesseract
|
|
|
def process_image_for_model(image):
    """Encode an image as a base64 PNG data URL for model input.

    Args:
        image: A PIL image, a NumPy array accepted by ``Image.fromarray``,
            or ``None``.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when ``image``
        is ``None``.
    """
    if image is None:
        return None

    import base64
    import io

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Pillow's PNG writer only accepts a fixed set of modes and raises
    # "cannot write mode ... as PNG" for the rest (CMYK, YCbCr, HSV, LAB,
    # F, ...). Normalise those to RGB so e.g. CMYK JPEGs still encode.
    png_writable_modes = {
        "1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA",
    }
    mode = getattr(image, "mode", None)
    if mode is not None and mode not in png_writable_modes:
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
|
|
|
def _read_image_as_grayscale(image_path):
    """Load ``image_path`` as a 2-D grayscale array, or ``None`` if unreadable."""
    bgr = cv2.imread(image_path)
    if bgr is not None:
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # cv2.imread returns None instead of raising when it cannot decode a
    # file (for example GIF on many OpenCV builds), so retry with Pillow
    # before reporting the file as unreadable.
    try:
        with Image.open(image_path) as pil_image:
            return np.array(pil_image.convert("L"))
    except (OSError, ValueError):
        return None


def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    The image is converted to grayscale and binarised with Otsu's
    threshold before being passed to Tesseract with ``--psm 6``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with surrounding whitespace stripped,
        ``"No text found in image"`` when OCR yields nothing, or a string
        starting with ``"Error"`` describing the failure. This function
        does not raise; every exception is reported through the return
        value.
    """
    try:
        # Probe for the Tesseract binary first so a missing install gets
        # a dedicated, actionable message.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."

        gray = _read_image_as_grayscale(image_path)
        if gray is None:
            return "Error: Could not read image file"

        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        text = pytesseract.image_to_string(binary, config='--psm 6').strip()
        return text or "No text found in image"

    except Exception as e:
        return f"Error extracting text from image: {e}"
|
|
|
def extract_text_from_file(file_path):
    """Return the text content of a file, dispatching on its extension.

    Supported extensions (case-insensitive): ``.pdf`` (PyPDF2), ``.txt``,
    ``.md``, ``.csv`` (read as UTF-8), ``.docx`` (paragraph text only) and
    common image formats (OCR via ``extract_text_from_image``).

    Args:
        file_path: Path of the file to read; a falsy value yields ``""``.

    Returns:
        The extracted text, ``""`` for a falsy path or an unsupported
        extension, or ``"Error extracting text: ..."`` if reading fails.
        This function does not raise; failures are reported in the
        returned string.
    """
    if not file_path:
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for pages without text.
                return "\n".join(page.extract_text() or "" for page in reader.pages)

        if ext in (".txt", ".md", ".csv"):
            # utf-8-sig decodes plain UTF-8 identically but also strips a
            # leading byte-order mark, which would otherwise leak into the
            # returned text as "\ufeff".
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()

        if ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)

        if ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)

        return ""
    except Exception as e:
        return f"Error extracting text: {e}"
|
|
|
def create_multimodal_message(text, image=None):
    """Build a user-role chat message from text and an optional image.

    Without an image the message content is the plain ``text`` string.
    With an image the content becomes a two-part list: a ``"text"`` part
    followed by an ``"image_url"`` part whose URL is whatever
    ``process_image_for_model(image)`` returns.
    """
    message = {"role": "user", "content": text}

    if image is not None:
        text_part = {"type": "text", "text": text}
        image_part = {
            "type": "image_url",
            "image_url": {"url": process_image_for_model(image)},
        }
        message["content"] = [text_part, image_part]

    return message