# demo_step1 / utils.py
# Valerianikooooo's picture
# Upload utils.py
# ba9fce6 verified
from pptx import Presentation
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import json
import io
import os
from docx import Document
import tempfile
from pdf2image import convert_from_path
from mistralai import Mistral
from io import BytesIO
import base64
def preprocess_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Splitting on whitespace drops newlines, tabs and repeated spaces, as well
    as any leading or trailing whitespace; the remaining tokens are re-joined
    with single spaces.
    """
    tokens = text.split()
    return ' '.join(tokens)
def convert_pptx_to_json(input_file, file_name):
    """Extract the text of every slide in a PowerPoint presentation.

    Text is read from each shape's ``text`` attribute.  Picture shapes are
    additionally run through Tesseract OCR (``rus+eng``) and the recognised
    text is appended to the slide text.  A picture that cannot be decoded or
    OCR'd is reported and skipped so that one bad image does not abort the
    whole conversion.

    Args:
        input_file: Passed straight to ``pptx.Presentation``.
        file_name: Value stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "slides": [...]}``.  Each slide entry has
        ``"slide_number"`` (1-based), ``"text"`` (every shape text / OCR
        chunk followed by a newline) and ``"shapes"`` (one
        ``{"shape_type", "text"}`` dict per shape).

    Raises:
        pytesseract.TesseractNotFoundError: Re-raised unchanged, because a
            missing Tesseract binary is a setup problem rather than a
            per-image failure.
    """
    # Local import keeps the block self-contained; replaces the magic 13.
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    slides_data = []
    prs = Presentation(input_file)
    for idx, slide in enumerate(prs.slides, start=1):
        text_parts = []
        shapes_data = []
        for shape in slide.shapes:
            shape_data = {"shape_type": shape.shape_type, "text": ""}

            # The shape carries text of its own.
            if hasattr(shape, "text") and shape.text.strip():
                text_parts.append(shape.text)
                shape_data["text"] = shape.text

            # The shape is a picture: recover any text in it with OCR.
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                try:
                    with Image.open(io.BytesIO(shape.image.blob)) as pillow_image:
                        extracted_text = pytesseract.image_to_string(
                            pillow_image, lang='rus+eng'
                        )
                except pytesseract.TesseractNotFoundError:
                    # Not specific to this image -- every picture would fail.
                    raise
                except (OSError, ValueError, pytesseract.TesseractError) as e:
                    # Unsupported image format, linked (non-embedded) image,
                    # or an OCR failure: skip this picture only.
                    print(f"Slide {idx}: skipping OCR for picture {shape.name!r}: {e}")
                else:
                    text_parts.append(extracted_text)

            shapes_data.append(shape_data)

        slides_data.append({
            "slide_number": idx,
            # Every chunk is newline-terminated, as in the original output.
            "text": "".join(part + "\n" for part in text_parts),
            "shapes": shapes_data,
        })
    return {'file_name': file_name, 'slides': slides_data}
def convert_pdf_to_json(input_file, api_key, file_name=None):
    """Extract the text of a PDF by sending each rendered page to Mistral.

    Every page is rendered to an image with ``pdf2image``, encoded as a
    base64 PNG and sent to the ``pixtral-large-latest`` model with a prompt
    asking for the text on the picture.  Pages are processed best-effort: a
    page whose request fails is reported on stdout and skipped, and pages
    with an empty answer are omitted.

    Args:
        input_file: Path of the PDF, passed to ``convert_from_path``.
        api_key: API key used to build the ``Mistral`` client.
        file_name: Optional name to store in the result.  Defaults to the
            base name of ``input_file``, which is the original behaviour.

    Returns:
        ``{"file_name": <name>, "text": [<page text>, ...]}`` with one list
        entry per page that produced a non-empty answer, in page order.
    """
    client = Mistral(api_key=api_key)

    prompt = (
        "\n"
        "Please extract all the text from the picture.\n"
        "Don't add anything from yourself. If there is no any text, give me an empty answer.\n"
    )

    def encode_image(image_obj):
        """Return *image_obj* (PIL image, path or file object) as base64 PNG."""
        buffered = io.BytesIO()
        if isinstance(image_obj, Image.Image):
            image_obj.save(buffered, format="PNG")
        else:
            # Close the underlying file as soon as the PNG bytes are written.
            with Image.open(image_obj) as img:
                img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def describe_image(image_obj):
        """Ask the vision model for the text visible on *image_obj*."""
        image_base64 = encode_image(image_obj)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                ]
            }
        ]
        response = client.chat.complete(
            model="pixtral-large-latest",
            messages=messages,
            max_tokens=600
        )
        return response.choices[0].message.content

    results = []
    # Pages are encoded straight from memory; writing each page to a
    # temporary PNG only to reopen and re-encode it was redundant work.
    for page_number, page_image in enumerate(convert_from_path(input_file), start=1):
        page_label = f'page_{page_number}.png'
        try:
            description = describe_image(page_image)
        except Exception as e:
            # Deliberate best-effort boundary: report the page and carry on.
            print(f"Ошибка обработки {page_label}: {e}")
        else:
            if description:
                results.append(description)

    if file_name is None:
        file_name = os.path.basename(input_file)
    return {"file_name": file_name, "text": results}
def convert_docx_to_json(input_file, file_name):
    """Return the paragraphs of a Word document as a JSON-ready dict.

    Args:
        input_file: Passed straight to ``docx.Document``.
        file_name: Value stored under ``"file_name"`` in the result.

    Returns:
        ``{"file_name": file_name, "paragraphs": [...]}`` where each entry
        holds the 1-based ``"paragraph_number"`` and the paragraph ``"text"``,
        in the order ``Document.paragraphs`` yields them.
    """
    document = Document(input_file)
    numbered = enumerate(document.paragraphs, start=1)
    return {
        "file_name": file_name,
        "paragraphs": [
            {"paragraph_number": number, "text": para.text}
            for number, para in numbered
        ],
    }