File size: 3,956 Bytes
ba9fce6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from pptx import Presentation
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import json
import io
import os
from docx import Document
import tempfile
from pdf2image import convert_from_path
from mistralai import Mistral
from io import BytesIO
import base64

def preprocess_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Splitting on whitespace discards newlines, tabs and repeated blanks as
    well as any leading or trailing whitespace, so the result is one trimmed
    line. No other characters are removed.
    """
    words = text.split()
    single_line = ' '.join(words)
    return single_line

def convert_pptx_to_json(input_file, file_name):
    """Extract the text of every slide of a PowerPoint file into a dict.

    For each slide, the text of every text-bearing shape is collected, and
    every picture shape is run through Tesseract OCR (Russian + English).
    OCR output is added to the slide-level text only; the per-shape entry
    of a picture keeps whatever ``shape.text`` provided (normally nothing).

    Args:
        input_file: Path or file-like object accepted by ``Presentation``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "slides": [...]}`` where each slide is
        ``{"slide_number": int, "text": str, "shapes": [...]}`` and each
        shape is ``{"shape_type": <value from python-pptx>, "text": str}``.
        Every collected text fragment in ``"text"`` is followed by a
        newline.

    A picture that cannot be read or recognised is reported on stdout and
    skipped, so one bad image does not abort the whole presentation.
    """
    picture_shape_type = 13  # numeric value of MSO_SHAPE_TYPE.PICTURE

    slides_data = []
    prs = Presentation(input_file)

    for idx, slide in enumerate(prs.slides, start=1):
        # Fragments are gathered in a list and joined once per slide
        # instead of growing a string with repeated ``+=``.
        text_parts = []
        shapes = []

        for shape in slide.shapes:
            shape_type = shape.shape_type
            shape_data = {"shape_type": shape_type, "text": ""}

            # Text-bearing shape: keep its text unless it is blank.
            shape_text = getattr(shape, "text", "")
            if shape_text.strip():
                text_parts.append(shape_text)
                shape_data["text"] = shape_text

            # Picture shape: recover any text in the image with OCR.
            if shape_type == picture_shape_type:
                try:
                    # The context manager releases the decoded image as
                    # soon as OCR is done.
                    with Image.open(io.BytesIO(shape.image.blob)) as picture:
                        ocr_text = pytesseract.image_to_string(
                            picture, lang='rus+eng'
                        )
                except (ValueError, OSError, RuntimeError) as e:
                    # Presumably covers pictures without embedded data
                    # (ValueError), formats Pillow cannot decode (OSError)
                    # and Tesseract failures (RuntimeError) -- verify
                    # against the installed library versions.
                    print(f"Ошибка обработки изображения на слайде {idx}: {e}")
                else:
                    text_parts.append(ocr_text)

            shapes.append(shape_data)

        slides_data.append({
            "slide_number": idx,
            "text": "".join(part + "\n" for part in text_parts),
            "shapes": shapes,
        })

    return {'file_name': file_name, 'slides': slides_data}

def convert_pdf_to_json(input_file, api_key, *,
                        model="pixtral-large-latest", max_tokens=600):
    """Extract the text of a PDF by sending each rendered page to Mistral.

    Every page is rasterised with ``pdf2image`` and the vision model is
    asked to transcribe the text it sees. Pages are sent straight from
    memory; no temporary files are written.

    Args:
        input_file: Path of the PDF file to convert.
        api_key: Mistral API key used to create the client.
        model: Name of the Mistral model to query (keyword-only).
        max_tokens: Upper bound on the length of each page's answer
            (keyword-only). Dense pages may need a larger value.

    Returns:
        ``{"file_name": <base name of input_file>, "text": [...]}`` where
        the list holds one non-empty answer per successfully processed
        page, in page order. Pages that fail or yield an empty answer are
        left out, so list positions do not necessarily match page numbers.
    """
    client = Mistral(api_key=api_key)

    # The prompt text is sent to the model verbatim, whitespace included.
    prompt = """
                    Please extract all the text from the picture. 
                    Don't add anything from yourself. If there is no any text, give me an empty answer.
                    """

    def encode_image(page_image):
        """Return the PIL image PNG-encoded and base64-wrapped as text."""
        buffered = io.BytesIO()
        page_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def describe_image(page_image):
        """Ask the model to transcribe the text visible in *page_image*."""
        image_base64 = encode_image(page_image)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                ]
            }
        ]
        response = client.chat.complete(
            model=model,
            messages=messages,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content

    results = []
    for page_number, page_image in enumerate(convert_from_path(input_file), start=1):
        # Label used only to identify the page in the error message.
        image_path = f'page_{page_number}.png'
        try:
            description = describe_image(page_image)
        except Exception as e:
            # Best effort: a failing page is reported and skipped so the
            # remaining pages are still processed.
            print(f"Ошибка обработки {image_path}: {e}")
        else:
            if description:
                results.append(description)

    return {"file_name": os.path.basename(input_file), "text": results}

def convert_docx_to_json(input_file, file_name):
    """Return the paragraphs of a Word document as a numbered list.

    Args:
        input_file: Path or file-like object accepted by ``Document``.
        file_name: Name stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "paragraphs": [...]}`` where each entry
        is ``{"paragraph_number": int, "text": str}``, numbered from 1 in
        the order ``doc.paragraphs`` yields them. Empty paragraphs are
        kept.
    """
    document = Document(input_file)
    numbered_paragraphs = enumerate(document.paragraphs, start=1)
    paragraphs = [
        {"paragraph_number": number, "text": para.text}
        for number, para in numbered_paragraphs
    ]
    return {"file_name": file_name, "paragraphs": paragraphs}