# Spaces: Runtime error (status banner captured with the page; not part of the program)
| from pptx import Presentation | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import pytesseract | |
| import json | |
| import io | |
| import os | |
| from docx import Document | |
| import tempfile | |
| from pdf2image import convert_from_path | |
| from mistralai import Mistral | |
| from io import BytesIO | |
| import base64 | |
def preprocess_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces each become one space, and leading
    and trailing whitespace is dropped. No other characters are removed.

    Args:
        text: The extracted text, or ``None``.

    Returns:
        The whitespace-normalized string. An empty string is returned when
        *text* is ``None`` or contains only whitespace.
    """
    if text is None:
        # Guard so a missing value does not raise AttributeError on .split().
        return ""
    # str.split() with no arguments splits on any whitespace run and discards
    # empty pieces, so joining with one space normalizes the spacing.
    return " ".join(text.split())
def convert_pptx_to_json(input_file, file_name):
    """Extract the text of every slide in a PowerPoint presentation.

    For each slide, the text of every shape exposing a non-blank ``text``
    attribute is collected, and every picture shape is run through Tesseract
    OCR (Russian + English). OCR output is added to the slide-level text only;
    the per-shape ``"text"`` entry holds just the shape's own text.

    Args:
        input_file: Path or file-like object passed to ``Presentation``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "slides": [...]}`` where each slide entry
        has ``"slide_number"`` (1-based), ``"text"`` (each collected piece
        followed by a newline) and ``"shapes"`` (one
        ``{"shape_type", "text"}`` dict per shape).
    """
    picture_shape_type = 13  # MSO_SHAPE_TYPE.PICTURE

    slides_data = []
    prs = Presentation(input_file)
    for idx, slide in enumerate(prs.slides, start=1):
        text_parts = []  # joined once per slide instead of repeated `+=`
        shapes = []
        for shape in slide.shapes:
            shape_type = shape.shape_type
            shape_data = {"shape_type": shape_type, "text": ""}

            # Text-bearing shape: record its text on the slide and the shape.
            shape_text = getattr(shape, "text", None)
            if shape_text and shape_text.strip():
                text_parts.append(shape_text)
                shape_data["text"] = shape_text

            # Picture: recover any text it contains with OCR.
            if shape_type == picture_shape_type:
                try:
                    picture = Image.open(io.BytesIO(shape.image.blob))
                except (ValueError, OSError) as e:
                    # A picture without embedded image data, or in a format
                    # Pillow cannot decode, should not abort the whole
                    # presentation; report it and keep going, like the
                    # per-page handling in the PDF converter.
                    print(f"Ошибка обработки изображения на слайде {idx}: {e}")
                else:
                    # OCR errors (e.g. Tesseract not installed) deliberately
                    # propagate: they are configuration problems, not
                    # per-image ones.
                    with picture:
                        text_parts.append(
                            pytesseract.image_to_string(picture, lang='rus+eng')
                        )

            shapes.append(shape_data)

        slides_data.append({
            "slide_number": idx,
            "text": "".join(part + "\n" for part in text_parts),
            "shapes": shapes,
        })
    return {'file_name': file_name, 'slides': slides_data}
def convert_pdf_to_json(input_file, api_key):
    """Extract the text of a PDF by sending each rendered page to a vision model.

    Every page is rendered to an image with ``convert_from_path`` and sent,
    as a base64 PNG data URL, to the Mistral ``pixtral-large-latest`` chat
    model with a prompt asking for the text in the picture. Each reply is
    capped at 600 tokens by the request.

    Args:
        input_file: Path of the PDF file to process.
        api_key: API key used to create the ``Mistral`` client.

    Returns:
        ``{"file_name": <basename of input_file>, "text": [...]}`` where the
        list holds the non-empty model replies in page order. Pages whose
        request fails or whose reply is empty are omitted, so list positions
        do not necessarily match page numbers.
    """
    client = Mistral(api_key=api_key)

    def encode_image(image_obj):
        """Return *image_obj* (a PIL image, or a path/file to open) as base64 PNG text."""
        buffered = io.BytesIO()
        if isinstance(image_obj, Image.Image):
            image_obj.save(buffered, format="PNG")
        else:
            # Close the file opened here as soon as it has been re-encoded.
            with Image.open(image_obj) as img:
                img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def describe_image(image):
        """Ask the model for the text in *image* and return the reply content."""
        image_base64 = encode_image(image)
        prompt = """
        Please extract all the text from the picture.
        Don't add anything from yourself. If there is no any text, give me an empty answer.
        """
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                ]
            }
        ]
        response = client.chat.complete(
            model="pixtral-large-latest",
            messages=messages,
            max_tokens=600
        )
        return response.choices[0].message.content

    results = []
    # The rendered pages are already PIL images, so they are encoded straight
    # from memory instead of being written to a temporary PNG and read back.
    for page_number, page_image in enumerate(convert_from_path(input_file), start=1):
        try:
            description = describe_image(page_image)
        except Exception as e:
            # Best effort: one failing page must not discard the other pages.
            print(f"Ошибка обработки page_{page_number}.png: {e}")
        else:
            if description:
                results.append(description)
    return {"file_name": os.path.basename(input_file), "text": results}
def convert_docx_to_json(input_file, file_name):
    """Return the paragraphs of a Word document as numbered records.

    Args:
        input_file: Path or file-like object passed to ``Document``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "paragraphs": [...]}`` where each entry is
        ``{"paragraph_number": n, "text": ...}``, numbered from 1 in the
        order the document yields its paragraphs.
    """
    document = Document(input_file)
    numbered = enumerate(document.paragraphs, start=1)
    records = [
        {"paragraph_number": number, "text": para.text}
        for number, para in numbered
    ]
    return {"file_name": file_name, "paragraphs": records}