File size: 10,224 Bytes
bf5067d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | import os
import base64
import requests
from dotenv import load_dotenv
from typing import Optional, Dict
# Resolve the Mistral SDK client class across SDK generations.
# After this block `Mistral` is the 1.x client class, the legacy (<1.x)
# client class, or None when neither import succeeds.
# The catch is intentionally broad: any failure while importing the SDK should
# degrade to the direct HTTP path instead of aborting the import of this module.
try:
    # mistralai>=1.x
    from mistralai import Mistral
except Exception:
    try:
        # mistralai<1.x
        from mistralai.client import MistralClient as Mistral
    except Exception:
        Mistral = None
# Load variables from a local .env file (if one exists) into the process
# environment before ImageOCR.__init__ reads MISTRAL_API_KEY via os.getenv.
load_dotenv()
class ImageOCR:
    """
    Extract text from news images using Mistral OCR API.
    Useful for analyzing screenshots of news shared on social media.

    The OCR call goes through the Mistral SDK client when one could be created
    and exposes an ``ocr`` resource; otherwise it goes through a direct HTTP
    request to the REST endpoint. All public extraction methods return ``None``
    when OCR is disabled and a result dict otherwise; failures are reported in
    the dict (``extraction_success`` is False) instead of being raised.
    """

    # Substrings that mark a line as social-media UI chrome rather than a headline.
    _UI_NOISE_WORDS = ('follow', 'share', 'comment', 'like', 'reply', 'retweet')

    # Substrings (matched case-insensitively) that suggest a line names the source.
    _SOURCE_KEYWORDS = (
        'reuters', 'bbc', 'cnn', 'fox', 'nbc', 'abc', 'times', 'post',
        'guardian', 'india today', 'ndtv', 'hindu', 'express', 'twitter',
        'x.com', 'facebook', 'instagram',
    )

    # Date regexes, tried in this order; the first pattern that matches wins.
    _DATE_PATTERNS = (
        r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # DD/MM/YYYY or similar
        r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}',  # 26 Feb 2026
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}',  # Feb 26, 2026
    )

    def __init__(self):
        """
        Read MISTRAL_API_KEY from the environment and pick an OCR transport.

        Leaves ``enabled`` False when the key is missing, is the placeholder
        value, or is 10 characters or shorter. Otherwise uses the SDK client if
        it can be constructed and exposes ``ocr``; if not, enables the direct
        HTTP fallback. Status is reported with ``print``.
        """
        self.api_key = os.getenv('MISTRAL_API_KEY')
        self.enabled = False
        self.client = None
        self.use_http_fallback = False
        self.model = "mistral-ocr-latest"  # Mistral's OCR model

        key_is_usable = (
            self.api_key
            and self.api_key != 'your_api_key_here'
            and len(self.api_key) > 10
        )
        if not key_is_usable:
            print("⚠ MISTRAL_API_KEY not configured, image OCR disabled")
            return

        if Mistral is not None:
            try:
                client = Mistral(api_key=self.api_key)
                # SDK releases that predate the OCR endpoint (including the
                # legacy <1.x client imported as a fallback) have no `ocr`
                # resource; accepting such a client would make every later
                # OCR call fail, so treat it like a failed initialization.
                if not hasattr(client, "ocr"):
                    raise RuntimeError("installed mistralai SDK does not expose an OCR endpoint")
                self.client = client
                self.enabled = True
                print("✓ Image OCR (Mistral OCR SDK) initialized successfully")
                return
            except Exception as e:
                print(f"⚠ Failed to initialize Mistral OCR SDK: {e}")

        # SDK import/init can fail in some cloud images; use direct HTTP API fallback.
        self.use_http_fallback = True
        self.enabled = True
        print("⚠ Mistral SDK unavailable, using direct HTTP OCR fallback")

    @staticmethod
    def _not_found_result(error: Optional[str] = None) -> Dict:
        """
        Build the result dict used whenever nothing could be extracted.

        Args:
            error: Optional error message; stored under "error" when given.
        Returns:
            Dict with every field set to "NOT_FOUND" and extraction_success False
        """
        result = {
            "title": "NOT_FOUND",
            "text": "NOT_FOUND",
            "source": "NOT_FOUND",
            "date": "NOT_FOUND",
        }
        if error is not None:
            result["error"] = error
        result["extraction_success"] = False
        return result

    @staticmethod
    def _split_data_url(base64_string: str, mime_type: str):
        """
        Separate an optional data-URL header from a base64 payload.

        Args:
            base64_string: Bare base64 text or "data:<mime>;base64,<payload>"
            mime_type: MIME type to use when the header does not declare one
        Returns:
            Tuple (payload, mime_type). When a "data:<type>/<subtype>" header is
            present, its MIME type replaces the ``mime_type`` argument.
        """
        if ',' not in base64_string:
            return base64_string, mime_type
        pieces = base64_string.split(',')
        header, payload = pieces[0].strip(), pieces[1]
        if header[:5].lower() == "data:":
            declared = header[5:].split(";", 1)[0].strip()
            # Only trust something that looks like "type/subtype".
            if "/" in declared:
                mime_type = declared
        return payload, mime_type

    def extract_text_from_image(self, image_data: bytes, mime_type: str = "image/jpeg") -> Optional[Dict]:
        """
        Extract news title and text from an image using Mistral OCR.
        Args:
            image_data: Raw image bytes
            mime_type: Image MIME type (image/jpeg, image/png, etc.)
        Returns:
            None when OCR is disabled; otherwise a dict with extracted title,
            text, and metadata (or the NOT_FOUND dict with "error" on failure)
        """
        if not self.enabled:
            return None
        try:
            # Reject empty input up front instead of sending an empty document.
            if not image_data:
                raise ValueError("empty image data")
            # Convert to base64
            base64_image = base64.b64encode(image_data).decode('utf-8')
            return self._call_mistral_ocr(base64_image, mime_type)
        except Exception as e:
            print(f"Image OCR error: {e}")
            return self._not_found_result(str(e))

    def _call_mistral_ocr(self, base64_image: str, mime_type: str) -> Dict:
        """
        Call Mistral OCR API for text extraction.

        Sends the image as a data URL through the SDK client, or through the
        HTTP fallback when no SDK client is in use, then parses the returned
        text. Any exception is printed and converted into the NOT_FOUND dict.
        """
        try:
            # Use Mistral OCR API with base64 image
            image_data_url = f"data:{mime_type};base64,{base64_image}"
            if self.client and not self.use_http_fallback:
                ocr_response = self.client.ocr.process(
                    model=self.model,
                    document={
                        "type": "image_url",
                        "image_url": image_data_url,
                    },
                )
            else:
                ocr_response = self._call_mistral_ocr_http(image_data_url)

            extracted_text = self._extract_text_from_ocr_response(ocr_response).strip()
            if not extracted_text:
                return self._not_found_result()
            # Parse the extracted text to find title and content
            return self._parse_extracted_text(extracted_text)
        except Exception as e:
            print(f"Mistral OCR API error: {e}")
            return self._not_found_result(str(e))

    def _call_mistral_ocr_http(self, image_data_url: str) -> Dict:
        """
        Fallback to Mistral OCR REST API if SDK is unavailable.

        Args:
            image_data_url: Image encoded as a "data:<mime>;base64,..." URL
        Returns:
            Decoded JSON body of the response
        Raises:
            RuntimeError: if no API key is set
            requests.HTTPError: on a 4xx/5xx response (via raise_for_status)
        """
        if not self.api_key:
            raise RuntimeError("MISTRAL_API_KEY is missing")
        response = requests.post(
            "https://api.mistral.ai/v1/ocr",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "document": {
                    "type": "image_url",
                    "image_url": image_data_url,
                },
            },
            timeout=60,
        )
        response.raise_for_status()
        return response.json()

    def _extract_text_from_ocr_response(self, ocr_response) -> str:
        """
        Extract page text from both SDK objects and HTTP JSON responses.

        For each page, "markdown" is preferred and "text" is used only when
        markdown is empty or absent. Page texts are joined with newlines and
        the result is stripped. Returns "" when there are no pages.
        """
        if isinstance(ocr_response, dict):
            # `or []` also covers an explicit null "pages" value.
            pages = ocr_response.get("pages") or []

            def read(page, field):
                return page.get(field) if isinstance(page, dict) else None
        else:
            pages = (getattr(ocr_response, "pages", None) or []) if ocr_response else []

            def read(page, field):
                return getattr(page, field, None)

        chunks = []
        for page in pages:
            content = read(page, "markdown") or read(page, "text")
            if content:
                chunks.append(content)
        return "\n".join(chunks).strip()

    def _parse_extracted_text(self, text: str) -> Dict:
        """
        Parse OCR extracted text to identify title, content, source, and date.

        Heuristics (all best-effort):
          * title: first line longer than 15 characters that contains none of
            the UI words; if no line qualifies, the first line. Capped at 300.
          * text: the lines after the title joined by spaces, capped at 2000;
            falls back to the title when nothing follows it.
          * source: first line containing a known source keyword, capped at 100.
          * date: first match of the date patterns anywhere in ``text``.

        Returns:
            Dict with title, text, source, date, raw_text (first 3000 chars)
            and extraction_success True; the NOT_FOUND dict if ``text`` has no
            non-blank lines.
        """
        import re

        lines = [stripped for stripped in (raw.strip() for raw in text.split('\n')) if stripped]
        if not lines:
            return self._not_found_result()

        title = "NOT_FOUND"
        text_content = "NOT_FOUND"

        # Find title (first line with significant content)
        for index, line in enumerate(lines):
            lowered = line.lower()
            # Skip very short lines or common UI elements
            if len(line) > 15 and not any(word in lowered for word in self._UI_NOISE_WORDS):
                title = line[:300]  # Limit title length
                # Rest is the text content
                following = lines[index + 1:]
                if following:
                    text_content = ' '.join(following)[:2000]  # Limit text length
                break

        # If no title found, use first line
        if title == "NOT_FOUND":
            title = lines[0][:300]
            if len(lines) > 1:
                text_content = ' '.join(lines[1:])[:2000]

        # Try to detect source (common news sources)
        source = "NOT_FOUND"
        for line in lines:
            lowered = line.lower()
            if any(keyword in lowered for keyword in self._SOURCE_KEYWORDS):
                source = line[:100]
                break

        # Try to detect date patterns
        date = "NOT_FOUND"
        for pattern in self._DATE_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                date = match.group()
                break

        return {
            "title": title,
            "text": text_content if text_content != "NOT_FOUND" else title,
            "source": source,
            "date": date,
            "raw_text": text[:3000],
            "extraction_success": True,
        }

    def extract_from_base64(self, base64_string: str, mime_type: str = "image/jpeg") -> Optional[Dict]:
        """
        Extract text from a base64-encoded image.
        Args:
            base64_string: Base64 encoded image string, optionally prefixed
                with a data-URL header ("data:image/png;base64,...")
            mime_type: Image MIME type; overridden by the MIME type declared
                in the data-URL header when one is present
        Returns:
            None when OCR is disabled; otherwise a dict with extracted text
            (or the NOT_FOUND dict with "error" on failure)
        """
        if not self.enabled:
            return None
        try:
            # Remove data URL prefix if present, keeping the MIME type it declares
            payload, mime_type = self._split_data_url(base64_string, mime_type)
            if not payload:
                raise ValueError("empty image data")
            return self._call_mistral_ocr(payload, mime_type)
        except Exception as e:
            print(f"Image OCR error: {e}")
            return self._not_found_result(str(e))
# Global instance, constructed when this module is imported. Construction reads
# MISTRAL_API_KEY from the environment and prints the resulting OCR status.
image_ocr = ImageOCR()
|