"""Convert PPTX, PDF, and DOCX documents into JSON-serializable dictionaries."""

import base64
import io
import json
import os
import tempfile
from io import BytesIO

import fitz  # PyMuPDF
import pytesseract
from docx import Document
from mistralai import Mistral
from pdf2image import convert_from_path
from PIL import Image
from pptx import Presentation

# Numeric value python-pptx reports for picture shapes (MSO_SHAPE_TYPE.PICTURE).
_PICTURE_SHAPE_TYPE = 13

# Instruction sent to the vision model together with every rendered PDF page.
_PAGE_OCR_PROMPT = (
    " Please extract all the text from the picture. "
    "Don't add anything from yourself. "
    "If there is no any text, give me an empty answer. \n"
)


def preprocess_text(text):
    """Collapse every run of whitespace (newlines included) into one space.

    Leading and trailing whitespace disappears as well, because the text is
    split on whitespace and re-joined with single spaces.
    """
    return " ".join(text.split())


def convert_pptx_to_json(input_file, file_name):
    """Extract per-slide text and shape information from a presentation.

    Args:
        input_file: Anything accepted by ``pptx.Presentation``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "slides": [...]}`` where every slide entry
        holds ``"slide_number"`` (1-based), ``"text"`` (text of all shapes plus
        OCR output of pictures, each piece terminated by a newline) and
        ``"shapes"`` (one ``{"shape_type", "text"}`` dict per shape).
    """
    slides_data = []
    prs = Presentation(input_file)

    for idx, slide in enumerate(prs.slides, start=1):
        text_chunks = []
        shapes = []

        for shape in slide.shapes:
            shape_data = {"shape_type": shape.shape_type, "text": ""}

            # Shape carries non-blank text: record it for the slide and the shape.
            if hasattr(shape, "text") and shape.text.strip():
                text_chunks.append(shape.text)
                shape_data["text"] = shape.text

            # Shape is a picture: run OCR; the result goes into the slide text only.
            if shape.shape_type == _PICTURE_SHAPE_TYPE:
                with Image.open(io.BytesIO(shape.image.blob)) as picture:
                    extracted_text = pytesseract.image_to_string(
                        picture, lang='rus+eng'
                    )
                text_chunks.append(extracted_text)

            shapes.append(shape_data)

        slides_data.append(
            {
                "slide_number": idx,
                # Every chunk is followed by a newline, as in incremental building.
                "text": "".join(chunk + "\n" for chunk in text_chunks),
                "shapes": shapes,
            }
        )

    return {'file_name': file_name, 'slides': slides_data}


def convert_pdf_to_json(input_file, api_key):
    """Extract text from a PDF by sending each rendered page to a Mistral model.

    Pages are rendered with ``pdf2image.convert_from_path``, written as PNG
    files into a temporary directory, and passed one by one to the
    ``pixtral-large-latest`` chat model together with an extraction prompt.

    Args:
        input_file: Path of the PDF file.
        api_key: API key used to construct the ``Mistral`` client.

    Returns:
        ``{"file_name": <basename of input_file>, "text": [...]}`` where the
        list holds the non-empty model answers in page order. Pages whose
        processing raised are reported on stdout and skipped.
    """
    client = Mistral(api_key=api_key)

    def encode_image(image_obj):
        """Return the image (PIL image or anything ``Image.open`` accepts) as base64 PNG."""
        buffered = io.BytesIO()
        if isinstance(image_obj, Image.Image):
            image_obj.save(buffered, format="PNG")
        else:
            # Close the file handle right away; otherwise it stays open and can
            # prevent the temporary directory from being removed.
            with Image.open(image_obj) as img:
                img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def describe_image(image_path):
        """Ask the model for the text visible in the image and return its answer."""
        image_base64 = encode_image(image_path)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _PAGE_OCR_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ],
            }
        ]
        response = client.chat.complete(
            model="pixtral-large-latest",
            messages=messages,
            max_tokens=600,
        )
        return response.choices[0].message.content

    results = []
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(input_file)
        for page_number, image in enumerate(images, start=1):
            image_path = os.path.join(temp_dir, f'page_{page_number}.png')
            image.save(image_path, 'PNG')
            try:
                description = describe_image(image_path)
                if description:
                    results.append(description)
            except Exception as e:
                # Best effort: report the failed page and continue with the rest.
                print(f"Ошибка обработки {image_path}: {e}")

    return {"file_name": os.path.basename(input_file), "text": results}


def convert_docx_to_json(input_file, file_name):
    """List the paragraphs of a Word document with 1-based numbering.

    Args:
        input_file: Anything accepted by ``docx.Document``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "paragraphs": [...]}`` where every entry is
        ``{"paragraph_number": n, "text": <paragraph text>}``.
    """
    doc = Document(input_file)
    paragraphs_data = [
        {"paragraph_number": para_idx, "text": paragraph.text}
        for para_idx, paragraph in enumerate(doc.paragraphs, start=1)
    ]
    return {"file_name": file_name, "paragraphs": paragraphs_data}