Spaces:

Ane4ka
/

NoteMaker

Sleeping

NoteMaker / app.py

ASureevaA

edit

529a697 about 2 months ago

6.54 kB

	from typing import Tuple, Optional

	import tempfile

	import numpy as numpy_module
	import soundfile as soundfile_module
	import torch
	import gradio as gradio_module
	from PIL import Image
	import easyocr
	from transformers import (
	pipeline,
	VitsModel,
	AutoTokenizer,
	)

	# Device that the models and tensors in this module are moved to.
	device_string: str = "cpu"

	# English-only EasyOCR reader, built once at import time with GPU use disabled.
	ocr_reader = easyocr.Reader(["en"], gpu=False)


	def run_ocr(image_object: Image.Image) -> str:
	    """
	    Run OCR for printed English text on an image.

	    Returns "" when no image is given; otherwise the non-empty OCR results
	    joined with newlines, with surrounding whitespace stripped.
	    """
	    if image_object is None:
	        return ""

	    # The reader is fed a numpy array built from the RGB-converted image.
	    pixel_array = numpy_module.array(image_object.convert("RGB"))
	    raw_chunks = ocr_reader.readtext(pixel_array, detail=0, paragraph=True)

	    kept_chunks = []
	    for chunk in raw_chunks:
	        if chunk:
	            kept_chunks.append(str(chunk))

	    return "\n".join(kept_chunks).strip()

	# Text-classification pipeline (DistilBERT fine-tuned on SST-2), loaded once at import time.
	text_classifier_pipeline = pipeline(
	    "text-classification",
	    model="distilbert-base-uncased-finetuned-sst-2-english",
	)


	def run_text_classification(input_text: str) -> str:
	    """
	    Analyze the text with the transformer classifier.

	    Returns "" for blank input; otherwise a string of the form
	    "<label> (score=<score with 3 decimals>)" built from the first result.
	    """
	    stripped_text = input_text.strip()
	    if stripped_text == "":
	        return ""

	    # Only the first entry of the pipeline output is used.
	    top_prediction = text_classifier_pipeline(
	        stripped_text,
	        truncation=True,
	        max_length=512,
	    )[0]

	    label = str(top_prediction.get("label", ""))
	    score = float(top_prediction.get("score", 0.0))
	    return f"{label} (score={score:.3f})"


	# Summarization pipeline (DistilBART CNN 12-6 checkpoint), loaded once at import time.
	summary_pipeline = pipeline(
	    "summarization",
	    model="sshleifer/distilbart-cnn-12-6",
	)


	def run_summarization(
	    input_text: str,
	    max_summary_tokens: int = 128,
	) -> str:
	    """
	    Summarize English text with the summarization pipeline.

	    Args:
	        input_text: Text to summarize; surrounding whitespace is ignored.
	        max_summary_tokens: Upper bound used for the summary ``max_length``.

	    Returns:
	        "" for blank input, the stripped text itself when it has fewer than
	        8 words, otherwise the stripped summary returned by the pipeline.
	    """
	    cleaned_text: str = input_text.strip()
	    if not cleaned_text:
	        return ""

	    word_count: int = len(cleaned_text.split())
	    if word_count < 8:
	        # Too short to summarize; hand the text back unchanged.
	        return cleaned_text

	    # Coerce to int in case the caller passes a float (e.g. from a UI
	    # slider -- TODO confirm); keep the budget at least 1.
	    token_budget: int = max(1, int(max_summary_tokens))
	    # Do not request a summary much longer than the input itself.
	    dynamic_max_length: int = min(token_budget, max(32, word_count + 20))
	    # Clamp so that min_length never exceeds max_length for small budgets.
	    dynamic_min_length: int = min(
	        dynamic_max_length,
	        max(10, dynamic_max_length // 3),
	    )

	    summary_result_list = summary_pipeline(
	        cleaned_text,
	        max_length=dynamic_max_length,
	        min_length=dynamic_min_length,
	        do_sample=False,
	        # Truncate over-long input instead of overflowing the model's input
	        # limit, matching what the classifier call in this file does.
	        truncation=True,
	    )

	    summary_text: str = summary_result_list[0]["summary_text"].strip()
	    return summary_text


	# Tokenizer and VITS model are both loaded from the facebook/mms-tts-eng checkpoint.
	tts_tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
	tts_model: VitsModel = VitsModel.from_pretrained("facebook/mms-tts-eng")

	# Move the model to the device used throughout this module.
	tts_model.to(device_string)


	def run_tts(summary_text: str) -> Optional[str]:
	    """
	    Synthesize speech for the English summary with VitsModel (facebook/mms-tts-eng).

	    Args:
	        summary_text: Text to speak; surrounding whitespace is ignored.

	    Returns:
	        Path of a newly created ``.wav`` file, or ``None`` when the text is
	        blank, tokenizes to nothing, or the model raises ``RuntimeError``.
	        The file is created with ``delete=False`` and is not removed here.
	    """
	    cleaned_text: str = summary_text.strip()
	    if not cleaned_text:
	        return None

	    tokenized_inputs = tts_tokenizer(
	        cleaned_text,
	        return_tensors="pt",
	    )
	    tokenized_inputs = {
	        key: value.to(device_string)
	        for key, value in tokenized_inputs.items()
	    }

	    input_ids_tensor = tokenized_inputs.get("input_ids")
	    if input_ids_tensor is None or input_ids_tensor.numel() == 0:
	        # Nothing left after tokenization, so there is nothing to synthesize.
	        return None

	    try:
	        with torch.no_grad():
	            model_output = tts_model(**tokenized_inputs)
	            waveform_tensor = model_output.waveform  # (batch, n_samples)
	    except RuntimeError as runtime_error:
	        # Best effort: report the failure and return no audio instead of crashing.
	        print(f"[WARN] TTS RuntimeError: {runtime_error}")
	        return None

	    waveform_array = waveform_tensor.squeeze().cpu().numpy().astype("float32")
	    # Keep samples inside [-1.0, 1.0] before writing them out.
	    waveform_array = numpy_module.clip(waveform_array, -1.0, 1.0)

	    # Use NamedTemporaryFile only to reserve a unique path, and close the
	    # handle before writing: reopening a still-open named temporary file by
	    # name is not possible on every platform (it fails on Windows).
	    with tempfile.NamedTemporaryFile(
	        suffix=".wav",
	        delete=False,
	    ) as temporary_file:
	        file_path: str = temporary_file.name

	    soundfile_module.write(
	        file_path,
	        waveform_array,
	        tts_model.config.sampling_rate,
	    )

	    return file_path


	def full_flow(
	    image_object: Image.Image,
	    max_summary_tokens: int = 128,
	) -> Tuple[str, str, str, Optional[str]]:
	    """
	    Run the whole chain on one image.

	    1) OCR
	    2) Text classification
	    3) Summarization
	    4) TTS

	    Returns a tuple of (recognized text, classification text, summary text,
	    audio file path or None).
	    """
	    ocr_text = run_ocr(image_object=image_object)
	    label_line = run_text_classification(ocr_text)
	    condensed_text = run_summarization(
	        input_text=ocr_text,
	        max_summary_tokens=max_summary_tokens,
	    )
	    # Speech is generated from the summary, not from the raw OCR text.
	    speech_path = run_tts(summary_text=condensed_text)

	    return (ocr_text, label_line, condensed_text, speech_path)


	# Gradio UI: an image and a length slider go in; OCR text, classification,
	# summary and the audio file path produced by full_flow come out.
	gradio_interface = gradio_module.Interface(
	    title="Картинка → Текст → Анализ → Конспект → Озвучка",
	    description="\n".join(
	        [
	            "1) easyocr распознаёт печатный английский текст с картинки.",
	            "2) Трансформер-классификатор (DistilBERT) оценивает тон текста.",
	            "3) Трансформер-суммаризатор (DistilBART) делает краткий конспект.",
	            "4) Трансформер TTS (MMS VITS) озвучивает конспект.",
	        ]
	    ),
	    fn=full_flow,
	    # Inputs are passed to full_flow positionally: image first, then the slider value.
	    inputs=[
	        gradio_module.Image(
	            label="Изображение с напечатанным английским текстом",
	            type="pil",
	        ),
	        gradio_module.Slider(
	            label="Максимальная длина конспекта (токены, примерно)",
	            minimum=32,
	            maximum=256,
	            step=16,
	            value=128,
	        ),
	    ],
	    # Outputs follow the order of the tuple returned by full_flow.
	    outputs=[
	        gradio_module.Textbox(
	            lines=8,
	            label="Распознанный текст (OCR, easyocr)",
	        ),
	        gradio_module.Textbox(
	            lines=2,
	            label="Анализ текста (классификация, DistilBERT)",
	        ),
	        gradio_module.Textbox(
	            lines=6,
	            label="Конспект (английский текст, DistilBART)",
	        ),
	        gradio_module.Audio(
	            type="filepath",
	            label="Озвучка конспекта (английский TTS, VITS)",
	        ),
	    ],
	)


	# Start the web UI only when this file is executed as a script.
	if __name__ == "__main__":
	    gradio_interface.launch()