math-chatbot-v2 / src /edurag_math_bot /image_to_text.py
pranshu dhiman
Deploy MathSutra Space
7fab45b
Raw
History Blame Contribute Delete
2.84 kB
from __future__ import annotations
import platform
import shutil
import subprocess
import tempfile
from pathlib import Path
# Swift source for a small command-line OCR program built on Apple's Vision
# framework. extract_text_from_image_file() writes this text to a temporary
# ``ocr.swift`` file and runs it with the ``swift`` executable, passing the
# image path as the first command-line argument.
#
# What the script does:
#   * requires an image path argument, otherwise fails with "Missing image path."
#   * runs a VNRecognizeTextRequest with the ``.accurate`` recognition level and
#     language correction enabled; automatic language detection is switched on
#     only when macOS 13.0 or newer is available
#   * takes the top candidate string of every observation, trims surrounding
#     whitespace/newlines, drops empty lines and prints the rest joined by
#     newlines on standard output
#   * on any failure writes a message to standard error and exits with status 1
#
# The backslashes are doubled ("\\n") because this is a regular (non-raw)
# Python string: Python collapses them so the Swift source receives "\n".
OCR_SWIFT = """
import Foundation
import Vision
func fail(_ message: String) -> Never {
FileHandle.standardError.write(Data((message + "\\n").utf8))
exit(1)
}
guard CommandLine.arguments.count > 1 else {
fail("Missing image path.")
}
let imageURL = URL(fileURLWithPath: CommandLine.arguments[1])
let request = VNRecognizeTextRequest()
request.recognitionLevel = .accurate
request.usesLanguageCorrection = true
if #available(macOS 13.0, *) {
request.automaticallyDetectsLanguage = true
}
do {
let handler = VNImageRequestHandler(url: imageURL, options: [:])
try handler.perform([request])
} catch {
fail(error.localizedDescription)
}
guard let observations = request.results, !observations.isEmpty else {
fail("No readable text was found in the image.")
}
let lines = observations.compactMap { observation in
observation.topCandidates(1).first?.string
}
let cleanedText = lines
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
.joined(separator: "\\n")
guard !cleanedText.isEmpty else {
fail("No readable text was found in the image.")
}
print(cleanedText)
"""
def extract_text_from_image_file(image_path: Path, timeout: int = 120) -> str:
    """Run Apple Vision OCR on an image file and return the recognised text.

    The Swift program stored in ``OCR_SWIFT`` is written to a temporary
    directory and executed with the ``swift`` executable found on ``PATH``,
    receiving the image path as its only argument.

    Args:
        image_path: Location of the image to read. A plain string path is
            accepted as well; it is converted to ``Path`` internally.
        timeout: Maximum number of seconds to wait for the Swift process.

    Returns:
        The standard output of the OCR script with surrounding whitespace
        stripped.

    Raises:
        RuntimeError: If the platform is not macOS, ``swift`` is not on
            ``PATH``, ``image_path`` is not an existing file, the script exits
            with a non-zero status, or it produces no text.
        subprocess.TimeoutExpired: If the Swift process runs longer than
            ``timeout`` seconds.
    """
    if platform.system() != "Darwin":
        raise RuntimeError(
            "Question image OCR currently works only on macOS because it uses Apple's Vision framework."
        )

    swift_executable = shutil.which("swift")
    if swift_executable is None:
        raise RuntimeError("Swift is not installed, so question-image OCR cannot run.")

    # Validate the input before launching Swift: a missing file would otherwise
    # only surface after the script has been started, with whatever message
    # Vision happens to produce. The environment checks above stay first so
    # their messages are unchanged.
    image_path = Path(image_path)
    if not image_path.is_file():
        raise RuntimeError(f"Image file not found: {image_path}")

    with tempfile.TemporaryDirectory() as tmp_dir:
        script_path = Path(tmp_dir) / "ocr.swift"
        script_path.write_text(OCR_SWIFT, encoding="utf-8")
        # The process must finish while the temporary directory still exists,
        # so the call stays inside the ``with`` block.
        result = subprocess.run(
            [swift_executable, str(script_path), str(image_path)],
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )

    if result.returncode != 0:
        # Prefer the script's stderr message; fall back to stdout, then a
        # generic message when the process printed nothing at all.
        message = result.stderr.strip() or result.stdout.strip() or "OCR failed."
        raise RuntimeError(message)

    extracted_text = result.stdout.strip()
    if not extracted_text:
        raise RuntimeError("No readable text was found in the image.")
    return extracted_text
def extract_text_from_image_bytes(
    image_bytes: bytes,
    suffix: str = ".png",
    timeout: int = 120,
) -> str:
    """Run OCR on in-memory image data and return the recognised text.

    The bytes are written to a temporary file, which is handed to
    ``extract_text_from_image_file`` and removed afterwards.

    Args:
        image_bytes: Raw contents of the image file.
        suffix: File-name suffix given to the temporary file.
        timeout: Forwarded to ``extract_text_from_image_file``.

    Returns:
        Whatever ``extract_text_from_image_file`` returns for the temporary
        file.

    Raises:
        Any exception raised while writing the temporary file or by
        ``extract_text_from_image_file``; the temporary file is removed in
        every case.
    """
    # delete=False because the file has to outlive the handle: it is reopened
    # by path by the OCR step, so removal is handled explicitly below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    image_path = Path(tmp_file.name)
    try:
        # The write sits inside the try so that a failing write (wrong payload
        # type, full disk, ...) no longer leaves the file behind. The ``with``
        # closes the handle before OCR runs and before the file is unlinked.
        with tmp_file:
            tmp_file.write(image_bytes)
        return extract_text_from_image_file(image_path=image_path, timeout=timeout)
    finally:
        image_path.unlink(missing_ok=True)