Spaces:

Valerianikooooo
/

demo_step1

Runtime error

App Files Files Community

demo_step1 / utils.py

Valerianikooooo

Upload utils.py

ba9fce6 verified about 1 year ago

raw

history blame contribute delete

3.96 kB

	from pptx import Presentation
	import fitz # PyMuPDF
	from PIL import Image
	import pytesseract
	import json
	import io
	import os
	from docx import Document
	import tempfile
	from pdf2image import convert_from_path
	from mistralai import Mistral
	from io import BytesIO
	import base64

	def preprocess_text(text):
	    """Collapse every run of whitespace in ``text`` into a single space.

	    Newlines, tabs and repeated spaces all count as whitespace; leading and
	    trailing whitespace is dropped entirely. No other characters are touched.
	    """
	    # str.split() with no separator splits on whitespace runs and discards
	    # empty pieces, so re-joining with one space normalizes the spacing.
	    words = text.split()
	    return ' '.join(words)

	def _ocr_picture(shape, slide_number):
	    """Return the OCR text of a picture shape, or None if its image is unusable."""
	    try:
	        # `shape.image` raises ValueError for pictures that are only linked,
	        # not embedded; Image.open raises OSError for unrecognized data.
	        picture = Image.open(io.BytesIO(shape.image.blob))
	        # Image.open is lazy: force decoding here so that corrupt or
	        # unsupported image data fails inside this try block rather than
	        # later inside the OCR call. On failure the half-opened image only
	        # wraps an in-memory buffer, so nothing OS-level is left open.
	        picture.load()
	    except (OSError, ValueError) as e:
	        # Best effort, same policy as the per-page handling in
	        # convert_pdf_to_json: report the picture and keep converting.
	        print(f"Ошибка обработки изображения на слайде {slide_number}: {e}")
	        return None
	    # OCR errors (e.g. a missing tesseract binary) are deliberately NOT
	    # caught: they indicate a broken environment, not a bad picture.
	    with picture:
	        return pytesseract.image_to_string(picture, lang='rus+eng')


	def convert_pptx_to_json(input_file, file_name):
	    """Extract per-slide text from a PowerPoint file, OCR-ing embedded pictures.

	    Args:
	        input_file: Passed unchanged to ``Presentation``.
	        file_name: Stored verbatim under the ``"file_name"`` key of the result.

	    Returns:
	        ``{"file_name": file_name, "slides": [...]}``. Each slide entry holds
	        ``"slide_number"`` (1-based), ``"text"`` (every non-blank shape text
	        and every OCR result in shape order, each followed by a newline) and
	        ``"shapes"`` (one ``{"shape_type", "text"}`` dict per shape; OCR text
	        is only added to the slide text, not to the shape entry).

	    Pictures whose image cannot be read are reported via ``print`` and
	    skipped instead of aborting the whole conversion.
	    """
	    slides_data = []
	    prs = Presentation(input_file)

	    for idx, slide in enumerate(prs.slides, start=1):
	        text_parts = []
	        shapes = []

	        for shape in slide.shapes:
	            shape_type = shape.shape_type
	            shape_data = {"shape_type": shape_type, "text": ""}

	            # Text-bearing shape: record its text unless it is blank.
	            if hasattr(shape, "text") and shape.text.strip():
	                text_parts.append(shape.text)
	                shape_data["text"] = shape.text

	            # Picture shape: recover any text with OCR.
	            if shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE = 13
	                extracted_text = _ocr_picture(shape, idx)
	                if extracted_text is not None:
	                    text_parts.append(extracted_text)

	            shapes.append(shape_data)

	        slides_data.append({
	            "slide_number": idx,
	            # Every fragment is newline-terminated, as before.
	            "text": "".join(part + "\n" for part in text_parts),
	            "shapes": shapes,
	        })

	    return {'file_name': file_name, 'slides': slides_data}

	def convert_pdf_to_json(input_file, api_key):
	    """Render each PDF page to an image and transcribe it with a Mistral vision model.

	    Args:
	        input_file: Path handed to ``convert_from_path``; its basename is
	            reported as ``"file_name"`` in the result.
	        api_key: Key used to construct the ``Mistral`` client.

	    Returns:
	        ``{"file_name": <basename of input_file>, "text": [...]}`` where the
	        list holds the non-empty model responses in page order. Pages whose
	        request raised, or whose response was empty, are omitted, so list
	        positions do not necessarily correspond to page numbers.
	    """
	    client = Mistral(api_key=api_key)

	    def encode_image(image_obj):
	        """Return ``image_obj`` (PIL image, path or file object) as base64 PNG text."""
	        buffered = io.BytesIO()
	        if isinstance(image_obj, Image.Image):
	            image_obj.save(buffered, format="PNG")
	        else:
	            # Close whatever Image.open opened so no file handle is leaked.
	            with Image.open(image_obj) as img:
	                img.save(buffered, format="PNG")
	        return base64.b64encode(buffered.getvalue()).decode("utf-8")

	    def describe_image(image_obj):
	        """Ask the vision model for the text visible in ``image_obj``."""
	        image_base64 = encode_image(image_obj)
	        # The whitespace inside this literal is part of the prompt that is
	        # sent to the model, so it is intentionally left exactly as it was.
	        prompt = """
	Please extract all the text from the picture.
	Don't add anything from yourself. If there is no any text, give me an empty answer.
	"""
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": prompt},
	                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
	                ]
	            }
	        ]
	        response = client.chat.complete(
	            model="pixtral-large-latest",
	            messages=messages,
	            max_tokens=600
	        )
	        return response.choices[0].message.content

	    results = []
	    # The rendered pages are already PIL images, so they are encoded straight
	    # from memory. The previous save-to-temp-file / reopen round trip added
	    # nothing and left one unclosed file handle per page.
	    for page_number, image in enumerate(convert_from_path(input_file), start=1):
	        page_label = f'page_{page_number}.png'
	        try:
	            description = describe_image(image)
	        except Exception as e:
	            # Deliberate best effort: one failing page must not abort the
	            # whole document, so report it and continue with the next page.
	            print(f"Ошибка обработки {page_label}: {e}")
	        else:
	            if description:
	                results.append(description)

	    return {"file_name": os.path.basename(input_file), "text": results}

	def convert_docx_to_json(input_file, file_name):
	    """List the paragraphs of a Word document with 1-based numbering.

	    Args:
	        input_file: Passed unchanged to ``Document``.
	        file_name: Stored verbatim under the ``"file_name"`` key of the result.

	    Returns:
	        ``{"file_name": file_name, "paragraphs": [...]}`` with one
	        ``{"paragraph_number", "text"}`` dict per entry of ``doc.paragraphs``,
	        in document order. Empty paragraphs are kept, with an empty text.
	    """
	    document = Document(input_file)
	    numbered_paragraphs = [
	        {"paragraph_number": number, "text": para.text}
	        for number, para in enumerate(document.paragraphs, start=1)
	    ]
	    return {"file_name": file_name, "paragraphs": numbered_paragraphs}