Spaces:
Sleeping
Sleeping
| from mistralai import Mistral | |
| from mistralai.models import OCRResponse | |
| from gradio import File | |
# Model identifier passed to `client.ocr.process` for OCR requests.
OCR_MODEL = "mistral-ocr-latest"
# Model identifier passed to `client.chat.complete` for the proofreading pass.
CHAT_MODEL = "mistral-large-latest"
def ocr_from_file(file_path, api_key: str, mode="image"):
    """Upload a local file to Mistral and run the OCR model on it.

    The file is uploaded with purpose ``"ocr"``, a signed URL is requested
    for the uploaded file, and that URL is handed to ``client.ocr.process``.

    Args:
        file_path: Path of the local file to upload. It is also sent as the
            upload's ``file_name``.
        api_key: Mistral API key.
        mode: ``"image"`` to submit the file as an image, ``"pdf"`` to submit
            it as a document.

    Returns:
        The object returned by ``client.ocr.process`` (presumably an
        ``OCRResponse`` -- confirm against the mistralai SDK).

    Raises:
        ValueError: If ``api_key`` is empty, ``mode`` is not supported, or the
            client cannot be constructed.
    """
    if not api_key:
        raise ValueError("Mistral API Key is required.")

    # Map each supported mode to the document "type", which is also the name
    # of the key that carries the URL in the OCR request payload.
    document_types = {"image": "image_url", "pdf": "document_url"}
    if mode not in document_types:
        # Fail before any upload instead of crashing later on an unbound
        # result variable.
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'image' or 'pdf'."
        )
    document_type = document_types[mode]

    try:
        client = Mistral(api_key=api_key)
    except Exception as e:
        raise ValueError("API invalid.") from e

    # Context manager guarantees the handle is closed even if the upload fails.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": file_path,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)

    return client.ocr.process(
        model=OCR_MODEL,
        document={
            "type": document_type,
            document_type: signed_url.url,
        },
        include_image_base64=True,
    )
def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Concatenate the markdown of every page in an OCR response.

    Args:
        ocr_response: Object exposing a ``pages`` iterable whose items each
            carry a ``markdown`` string.

    Returns:
        The page markdowns in order, separated by one blank line; an empty
        string when there are no pages.
    """
    page_separator = "\n\n"
    return page_separator.join(page.markdown for page in ocr_response.pages)
def correct_text_with_ai(text: str, api_key: str) -> str:
    """Ask the chat model to proofread OCR-generated Markdown.

    Args:
        text: Markdown text produced by OCR.
        api_key: Mistral API key.

    Returns:
        The content of the first completion choice. Empty or whitespace-only
        input is returned as-is without contacting the API. If the client
        cannot be constructed, a string of the form ``"ERROR: <message>"`` is
        returned instead of raising.

    Raises:
        ValueError: If ``api_key`` is empty.
    """
    if not api_key:
        raise ValueError("Mistral API Key is required.")

    # Nothing to proofread: skip the API call rather than let the model
    # invent a reply for an empty prompt.
    if not text or not text.strip():
        return text or ""

    try:
        client = Mistral(api_key=api_key)
    except Exception as e:
        # Kept as a returned string (not an exception) so existing callers
        # that display the result keep working.
        return f"ERROR: {str(e)}"

    system_prompt = """You are an expert proofreader specializing in Markdown formatting and OCR error correction. Your task is to meticulously review provided Markdown text that has been generated via OCR.
Your primary goal is to identify and correct **typographical errors, spelling mistakes, and redundant symbols** that are clearly a result of the OCR process.
Additionally, you must correct any illogical or jumbled line breaks to ensure proper Markdown paragraph formatting.
**Crucially, you must NOT alter the original meaning or content of the text.** Your corrections should be limited to:
* Obvious OCR-induced spelling errors
* Erroneous or redundant symbols
* Markdown formatting errors
* Jumbled or incorrect line breaks for proper paragraphing
After your thorough review, output the carefully corrected Markdown text. JUST the text."""

    response = client.chat.complete(
        model=CHAT_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        # Low temperature keeps the output close to the input text.
        temperature=0.1,
    )
    return response.choices[0].message.content
def perform_raw_ocr(input_file: File, api_key: str):
    """Return the raw text of an uploaded file.

    ``.txt`` files are read directly as UTF-8. ``.pdf`` files are sent to OCR
    in "pdf" mode, and every other extension is sent in "image" mode.

    Args:
        input_file: The uploaded file: either a path string or an object that
            exposes the path as ``.name`` (presumably what the Gradio file
            component passes -- confirm against the installed Gradio version).
        api_key: Mistral API key, forwarded to ``ocr_from_file``.

    Returns:
        The file's text or the combined OCR markdown, or the string
        ``"File/Text not found"`` when no usable file path is available.
    """
    # Function-local import so this block does not depend on the file's
    # top-level import list.
    import os

    if not input_file:
        return "File/Text not found"

    # Always work with the filesystem path: `open()` and the upload helper
    # need a path, not the upload wrapper object.
    if isinstance(input_file, str):
        file_path = input_file
    else:
        file_path = getattr(input_file, "name", None)
    if not file_path:
        return "File/Text not found"

    # splitext yields "" for names without an extension, so such files fall
    # through to the image branch instead of being misread.
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    file_type = "pdf" if file_ext == ".pdf" else "image"
    response = ocr_from_file(file_path, api_key, file_type)
    return get_combined_markdown(response)