import difflib
import html
import logging
import os

import cv2
import language_tool_python
import numpy as np
import torch
from deep_translator import GoogleTranslator
from django.http import JsonResponse
from django.shortcuts import render
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST
from PIL import Image, ImageEnhance, ImageOps
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
| |
# --- Environment configuration (must happen before the heavy ML libraries
# initialize) --- caches are redirected to /tmp so the app can run on
# read-only or ephemeral filesystems; the Paddle flags tame its newer
# IR/model-source behavior.
_ENV_DEFAULTS = {
    'TRANSFORMERS_CACHE': '/tmp/.cache',
    'HF_HOME': '/tmp/.cache',
    'PADDLE_HOME': '/tmp/.paddle',
    'FLAGS_enable_pir_api': '0',
    'PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK': 'True',
}
for _key, _value in _ENV_DEFAULTS.items():
    os.environ[_key] = _value

# Module-level logger.
logger = logging.getLogger(__name__)

# Lazily-initialized singletons: models are loaded on first use (see the
# get_* accessors below) to keep process startup fast.
_summarizer_model = None
_summarizer_tokenizer = None
_ocr_model = None
_tool = None
|
|
|
|
def get_summarizer():
    """Lazily load and cache the DistilBART summarization model/tokenizer.

    Returns:
        tuple: (model, tokenizer) for "sshleifer/distilbart-cnn-6-6".
    """
    global _summarizer_model, _summarizer_tokenizer
    if _summarizer_model is None:
        # Use the module logger instead of print so the messages flow through
        # the configured logging pipeline, consistent with `logger` above.
        logger.info("🚀 Загрузка Bart напрямую...")
        model_name = "sshleifer/distilbart-cnn-6-6"
        _summarizer_tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/tmp/.cache")
        _summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir="/tmp/.cache")
        logger.info("✅ Модель Bart загружена")
    return _summarizer_model, _summarizer_tokenizer
|
|
|
|
def get_tool():
    """Lazily create and cache a Russian LanguageTool grammar checker."""
    global _tool
    if _tool is None:
        # Log through the module logger (not print) for consistency with
        # the `logger` defined at module level.
        logger.info("🚀 Загрузка LanguageTool...")
        _tool = language_tool_python.LanguageTool('ru-RU')
    return _tool
|
|
|
|
def get_ocr():
    """Lazily create and cache a PaddleOCR instance for Russian text.

    PaddleOCR is imported inside the function so the heavy dependency is
    only loaded when OCR is actually requested.
    """
    global _ocr_model
    if _ocr_model is None:
        # Log through the module logger (not print) for consistency.
        logger.info("🚀 Загрузка PaddleOCR...")
        from paddleocr import PaddleOCR
        _ocr_model = PaddleOCR(use_angle_cls=True, lang='ru', enable_mkldnn=False)
    return _ocr_model
|
|
|
|
def get_text_diff(old_text, new_text):
    """Build an HTML fragment of *old_text* with changed spans highlighted.

    Segments of *old_text* that were deleted or replaced relative to
    *new_text* are wrapped in a red-highlight <span>; unchanged segments
    pass through. Insertions (text present only in *new_text*) are
    intentionally not rendered.

    All text is passed through html.escape before being embedded, so
    user-supplied input cannot inject markup or scripts into the diff
    that the frontend renders as HTML.
    """
    result = []
    matcher = difflib.SequenceMatcher(None, old_text, new_text)
    for tag, i1, i2, _j1, _j2 in matcher.get_opcodes():
        if tag == 'equal':
            result.append(html.escape(old_text[i1:i2]))
        elif tag in ('replace', 'delete'):
            result.append(
                f'<span style="background:#ffcccb; color:#b31d28; border-radius:3px; '
                f'padding:0 2px;">{html.escape(old_text[i1:i2])}</span>'
            )
    return "".join(result)
|
|
|
|
def limit_image_size(image, max_width=1920, max_height=1080):
    """Downscale *image* to fit within max_width x max_height, keeping aspect ratio.

    Images already within bounds are returned unchanged (same object).
    """
    height, width = image.shape[:2]
    fits = width <= max_width and height <= max_height
    if fits:
        return image
    ratio = min(max_width / width, max_height / height)
    new_size = (int(width * ratio), int(height * ratio))
    return cv2.resize(image, new_size, interpolation=cv2.INTER_LANCZOS4)
|
|
|
|
def index(request):
    """Render the main single-page UI."""
    template_name = 'index.html'
    return render(request, template_name)
|
|
|
|
@csrf_exempt
@require_POST
def summarize(request):
    """Summarize the POSTed 'text' field with DistilBART.

    Cyrillic input is first translated to English (the model works on
    English) and the resulting summary is translated back to Russian.
    Returns JSON with 'summary', 'orig_words' and 'summary_words', or an
    'error' key on failure.
    """
    source_text = request.POST.get('text', '')
    if not source_text.strip():
        return JsonResponse({'error': 'Текст пуст'})
    try:
        model, tokenizer = get_summarizer()
        word_count = len(source_text.split())

        # Detect Cyrillic characters to decide whether a translation
        # round-trip through English is needed.
        has_cyrillic = any('\u0400' <= ch <= '\u04FF' for ch in source_text)
        snippet = source_text[:2000]
        if has_cyrillic:
            snippet = GoogleTranslator(source='auto', target='en').translate(snippet)

        encoded = tokenizer(snippet, return_tensors="pt", max_length=1024, truncation=True)
        generated = model.generate(
            encoded["input_ids"],
            max_length=150,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summary_en = tokenizer.decode(generated[0], skip_special_tokens=True)

        if has_cyrillic:
            summary_out = GoogleTranslator(source='en', target='ru').translate(summary_en)
        else:
            summary_out = summary_en

        return JsonResponse({
            'summary': summary_out,
            'orig_words': word_count,
            'summary_words': len(summary_out.split()),
        })
    except Exception as e:
        return JsonResponse({'error': f"Ошибка суммаризации: {str(e)}"})
|
|
|
|
@csrf_exempt
@require_POST
def translate(request):
    """Translate the POSTed 'text' into the language given in 'to' (default 'en').

    Input is capped at 5000 characters. Returns JSON with 'translation'
    or an 'error' key on failure.
    """
    source = request.POST.get('text', '')
    target = request.POST.get('to', 'en')
    if not source.strip():
        return JsonResponse({'error': 'Текст пуст'})
    try:
        translated = GoogleTranslator(source='auto', target=target).translate(source[:5000])
        return JsonResponse({'translation': translated})
    except Exception as e:
        return JsonResponse({'error': str(e)})
|
|
|
|
@csrf_exempt
@require_POST
def spellcheck(request):
    """Spell- and grammar-check the POSTed 'text'.

    Runs Yandex Speller first, then LanguageTool grammar correction on
    the result. Returns the corrected text, an HTML diff against the
    original, and whether the input needed no changes.
    """
    original = request.POST.get('text', '')
    if not original.strip():
        return JsonResponse({'error': 'Введите текст'})
    try:
        # Imported lazily so the module loads even without pyaspeller.
        from pyaspeller import YandexSpeller
        corrected = YandexSpeller().spelled(original)

        grammar_tool = get_tool()
        # Only rewrite when LanguageTool actually reports matches.
        if grammar_tool and grammar_tool.check(corrected):
            corrected = grammar_tool.correct(corrected)

        return JsonResponse({
            'spelled_text': corrected,
            'diff_html': get_text_diff(original, corrected),
            'is_perfect': original == corrected,
        })
    except Exception as e:
        return JsonResponse({'error': str(e)})
|
|
|
|
@csrf_exempt
def ocr_process(request):
    """Run PaddleOCR on an uploaded image and return the recognized text.

    Expects a POST with an 'image' file. The image is size-limited before
    OCR, recognized text blocks are joined, and stray spaces before
    punctuation are removed. Returns JSON {'text': ...} or {'error': ...}.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Метод не поддерживается'})
    image_file = request.FILES.get('image')
    if not image_file:
        return JsonResponse({'error': 'Файл не найден'})
    try:
        file_bytes = image_file.read()
        nparr = np.frombuffer(file_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # cv2.imdecode returns None (no exception) for corrupt/unsupported
        # data; fail with a clear message instead of an opaque
        # AttributeError from limit_image_size below.
        if img is None:
            return JsonResponse({'error': 'Не удалось прочитать изображение'})
        img = limit_image_size(img)

        ocr = get_ocr()
        result = ocr.ocr(img)

        # Each detected line exposes its text at line[1][0]
        # (presumably [box, (text, confidence)] — Paddle's classic format).
        final_text_blocks = []
        if result and result[0]:
            for line in result[0]:
                final_text_blocks.append(line[1][0])

        # Remove the stray spaces before punctuation left by joining blocks.
        import re
        clean_text = re.sub(r'\s+([,.!?;:])', r'\1', " ".join(final_text_blocks))
        return JsonResponse({'text': clean_text})
    except Exception as e:
        return JsonResponse({'error': f'Ошибка OCR: {str(e)}'})
|
|