Update summary/views.py
Browse files- summary/views.py +40 -33
summary/views.py
CHANGED
|
@@ -13,6 +13,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
| 13 |
import torch
|
| 14 |
from deep_translator import GoogleTranslator
|
| 15 |
|
|
|
|
| 16 |
# --- НАСТРОЙКИ ОКРУЖЕНИЯ ---
|
| 17 |
os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
|
| 18 |
os.environ['HF_HOME'] = '/tmp/.cache'
|
|
@@ -21,13 +22,16 @@ os.environ['FLAGS_enable_pir_api'] = '0'
|
|
| 21 |
# Отключаем проверку обновлений для Paddle, чтобы он не висел
|
| 22 |
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
| 23 |
|
|
|
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
|
|
| 26 |
# Глобальные переменные (ОБЯЗАТЕЛЬНО ВСЕ ТРИ)
|
| 27 |
_summarizer_model = None
|
| 28 |
_summarizer_tokenizer = None
|
| 29 |
_ocr_model = None
|
| 30 |
-
_tool = None
|
|
|
|
| 31 |
|
| 32 |
def get_summarizer():
|
| 33 |
global _summarizer_model, _summarizer_tokenizer
|
|
@@ -39,6 +43,7 @@ def get_summarizer():
|
|
| 39 |
print("✅ Модель Bart загружена")
|
| 40 |
return _summarizer_model, _summarizer_tokenizer
|
| 41 |
|
|
|
|
| 42 |
def get_tool():
|
| 43 |
global _tool
|
| 44 |
if _tool is None:
|
|
@@ -46,16 +51,13 @@ def get_tool():
|
|
| 46 |
_tool = language_tool_python.LanguageTool('ru-RU')
|
| 47 |
return _tool
|
| 48 |
|
|
|
|
| 49 |
def get_ocr():
|
| 50 |
global _ocr_model
|
| 51 |
if _ocr_model is None:
|
|
|
|
| 52 |
from paddleocr import PaddleOCR
|
| 53 |
-
|
| 54 |
-
_ocr_model = PaddleOCR(
|
| 55 |
-
use_textline_orientation=True,
|
| 56 |
-
lang='ru',
|
| 57 |
-
show_log=False
|
| 58 |
-
)
|
| 59 |
return _ocr_model
|
| 60 |
|
| 61 |
|
|
@@ -69,6 +71,7 @@ def get_text_diff(old_text, new_text):
|
|
| 69 |
result.append(f'<span style="background:#ffcccb; color:#b31d28; border-radius:3px; padding:0 2px;">{old_text[i1:i2]}</span>')
|
| 70 |
return "".join(result)
|
| 71 |
|
|
|
|
| 72 |
def limit_image_size(image, max_width=1920, max_height=1080):
|
| 73 |
h, w = image.shape[:2]
|
| 74 |
if h > max_height or w > max_width:
|
|
@@ -76,27 +79,30 @@ def limit_image_size(image, max_width=1920, max_height=1080):
|
|
| 76 |
return cv2.resize(image, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LANCZOS4)
|
| 77 |
return image
|
| 78 |
|
|
|
|
| 79 |
def index(request):
|
| 80 |
return render(request, 'index.html')
|
| 81 |
|
|
|
|
| 82 |
@csrf_exempt
|
| 83 |
@require_POST
|
| 84 |
def summarize(request):
|
| 85 |
text = request.POST.get('text', '')
|
| 86 |
-
if not text.strip():
|
|
|
|
| 87 |
try:
|
| 88 |
model, tokenizer = get_summarizer()
|
| 89 |
orig_words_count = len(text.split())
|
| 90 |
-
|
| 91 |
is_russian = any('\u0400' <= c <= '\u04FF' for c in text)
|
| 92 |
text_for_ai = GoogleTranslator(source='auto', target='en').translate(text[:2000]) if is_russian else text[:2000]
|
| 93 |
|
| 94 |
inputs = tokenizer(text_for_ai, return_tensors="pt", max_length=1024, truncation=True)
|
| 95 |
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
|
| 96 |
-
summary_txt = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 97 |
|
| 98 |
final = GoogleTranslator(source='en', target='ru').translate(summary_txt) if is_russian else summary_txt
|
| 99 |
-
|
| 100 |
return JsonResponse({
|
| 101 |
'summary': final,
|
| 102 |
'orig_words': orig_words_count,
|
|
@@ -105,23 +111,27 @@ def summarize(request):
|
|
| 105 |
except Exception as e:
|
| 106 |
return JsonResponse({'error': f"Ошибка суммаризации: {str(e)}"})
|
| 107 |
|
|
|
|
| 108 |
@csrf_exempt
|
| 109 |
@require_POST
|
| 110 |
def translate(request):
|
| 111 |
text = request.POST.get('text', '')
|
| 112 |
to_lang = request.POST.get('to', 'en')
|
| 113 |
-
if not text.strip():
|
|
|
|
| 114 |
try:
|
| 115 |
res = GoogleTranslator(source='auto', target=to_lang).translate(text[:5000])
|
| 116 |
return JsonResponse({'translation': res})
|
| 117 |
except Exception as e:
|
| 118 |
return JsonResponse({'error': str(e)})
|
| 119 |
|
|
|
|
| 120 |
@csrf_exempt
|
| 121 |
@require_POST
|
| 122 |
def spellcheck(request):
|
| 123 |
text = request.POST.get('text', '')
|
| 124 |
-
if not text.strip():
|
|
|
|
| 125 |
try:
|
| 126 |
from pyaspeller import YandexSpeller
|
| 127 |
speller = YandexSpeller()
|
|
@@ -129,40 +139,37 @@ def spellcheck(request):
|
|
| 129 |
tool_instance = get_tool()
|
| 130 |
if tool_instance:
|
| 131 |
matches = tool_instance.check(fixed_text)
|
| 132 |
-
if matches:
|
|
|
|
| 133 |
diff_html = get_text_diff(text, fixed_text)
|
| 134 |
return JsonResponse({'spelled_text': fixed_text, 'diff_html': diff_html, 'is_perfect': text == fixed_text})
|
| 135 |
except Exception as e:
|
| 136 |
return JsonResponse({'error': str(e)})
|
| 137 |
|
|
|
|
| 138 |
@csrf_exempt
|
| 139 |
def ocr_process(request):
|
| 140 |
-
if request.method != 'POST':
|
|
|
|
| 141 |
image_file = request.FILES.get('image')
|
| 142 |
-
if not image_file:
|
| 143 |
-
|
| 144 |
try:
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
img = cv2.imdecode(
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
return JsonResponse({'error': 'Не удалось прочитать изображение'})
|
| 151 |
-
|
| 152 |
-
# --- OCR EXECUTION ---
|
| 153 |
ocr = get_ocr()
|
| 154 |
result = ocr.ocr(img)
|
| 155 |
|
| 156 |
-
|
| 157 |
-
extracted_text = []
|
| 158 |
if result and result[0]:
|
| 159 |
for line in result[0]:
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
clean_text = " ".join(
|
| 164 |
return JsonResponse({'text': clean_text})
|
| 165 |
-
|
| 166 |
except Exception as e:
|
| 167 |
return JsonResponse({'error': f'Ошибка OCR: {str(e)}'})
|
| 168 |
-
|
|
|
|
| 13 |
import torch
|
| 14 |
from deep_translator import GoogleTranslator
|
| 15 |
|
| 16 |
+
|
| 17 |
# --- НАСТРОЙКИ ОКРУЖЕНИЯ ---
|
| 18 |
os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
|
| 19 |
os.environ['HF_HOME'] = '/tmp/.cache'
|
|
|
|
| 22 |
# Отключаем проверку обновлений для Paddle, чтобы он не висел
|
| 23 |
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
| 24 |
|
| 25 |
+
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
+
|
| 29 |
# Глобальные переменные (ОБЯЗАТЕЛЬНО ВСЕ ТРИ)
|
| 30 |
_summarizer_model = None
|
| 31 |
_summarizer_tokenizer = None
|
| 32 |
_ocr_model = None
|
| 33 |
+
_tool = None # Теперь точно объявлена
|
| 34 |
+
|
| 35 |
|
| 36 |
def get_summarizer():
|
| 37 |
global _summarizer_model, _summarizer_tokenizer
|
|
|
|
| 43 |
print("✅ Модель Bart загружена")
|
| 44 |
return _summarizer_model, _summarizer_tokenizer
|
| 45 |
|
| 46 |
+
|
| 47 |
def get_tool():
|
| 48 |
global _tool
|
| 49 |
if _tool is None:
|
|
|
|
| 51 |
_tool = language_tool_python.LanguageTool('ru-RU')
|
| 52 |
return _tool
|
| 53 |
|
| 54 |
+
|
| 55 |
def get_ocr():
    """Return the process-wide PaddleOCR instance, creating it on first call.

    The model is expensive to load, so it is cached in the module-level
    ``_ocr_model`` singleton. Configured for Russian with text-angle
    classification enabled and MKL-DNN disabled.
    """
    global _ocr_model
    if _ocr_model is not None:
        return _ocr_model
    print("🚀 Загрузка PaddleOCR...")
    # Imported lazily so the heavy paddle stack is only pulled in when OCR
    # is actually requested.
    from paddleocr import PaddleOCR
    _ocr_model = PaddleOCR(use_angle_cls=True, lang='ru', enable_mkldnn=False)
    return _ocr_model
|
| 62 |
|
| 63 |
|
|
|
|
| 71 |
result.append(f'<span style="background:#ffcccb; color:#b31d28; border-radius:3px; padding:0 2px;">{old_text[i1:i2]}</span>')
|
| 72 |
return "".join(result)
|
| 73 |
|
| 74 |
+
|
| 75 |
def limit_image_size(image, max_width=1920, max_height=1080):
|
| 76 |
h, w = image.shape[:2]
|
| 77 |
if h > max_height or w > max_width:
|
|
|
|
| 79 |
return cv2.resize(image, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LANCZOS4)
|
| 80 |
return image
|
| 81 |
|
| 82 |
+
|
| 83 |
def index(request):
    """Render the application's main page."""
    return render(request, 'index.html')
|
| 85 |
|
| 86 |
+
|
| 87 |
@csrf_exempt
|
| 88 |
@require_POST
|
| 89 |
def summarize(request):
|
| 90 |
text = request.POST.get('text', '')
|
| 91 |
+
if not text.strip():
|
| 92 |
+
return JsonResponse({'error': 'Текст пуст'})
|
| 93 |
try:
|
| 94 |
model, tokenizer = get_summarizer()
|
| 95 |
orig_words_count = len(text.split())
|
| 96 |
+
|
| 97 |
is_russian = any('\u0400' <= c <= '\u04FF' for c in text)
|
| 98 |
text_for_ai = GoogleTranslator(source='auto', target='en').translate(text[:2000]) if is_russian else text[:2000]
|
| 99 |
|
| 100 |
inputs = tokenizer(text_for_ai, return_tensors="pt", max_length=1024, truncation=True)
|
| 101 |
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
|
| 102 |
+
summary_txt = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 103 |
|
| 104 |
final = GoogleTranslator(source='en', target='ru').translate(summary_txt) if is_russian else summary_txt
|
| 105 |
+
|
| 106 |
return JsonResponse({
|
| 107 |
'summary': final,
|
| 108 |
'orig_words': orig_words_count,
|
|
|
|
| 111 |
except Exception as e:
|
| 112 |
return JsonResponse({'error': f"Ошибка суммаризации: {str(e)}"})
|
| 113 |
|
| 114 |
+
|
| 115 |
@csrf_exempt
@require_POST
def translate(request):
    """Translate POSTed text to the requested language.

    Expects form fields ``text`` (source text) and ``to`` (target language
    code, defaults to ``en``). Input is capped at 5000 characters before
    being sent to Google Translate. Always responds with JSON: either
    ``{'translation': ...}`` or ``{'error': ...}``.
    """
    source_text = request.POST.get('text', '')
    target_lang = request.POST.get('to', 'en')
    if not source_text.strip():
        return JsonResponse({'error': 'Текст пуст'})
    try:
        translated = GoogleTranslator(source='auto', target=target_lang).translate(source_text[:5000])
        return JsonResponse({'translation': translated})
    except Exception as exc:
        # Best-effort endpoint: surface the failure to the client as JSON.
        return JsonResponse({'error': str(exc)})
|
| 127 |
|
| 128 |
+
|
| 129 |
@csrf_exempt
|
| 130 |
@require_POST
|
| 131 |
def spellcheck(request):
|
| 132 |
text = request.POST.get('text', '')
|
| 133 |
+
if not text.strip():
|
| 134 |
+
return JsonResponse({'error': 'Введите текст'})
|
| 135 |
try:
|
| 136 |
from pyaspeller import YandexSpeller
|
| 137 |
speller = YandexSpeller()
|
|
|
|
| 139 |
tool_instance = get_tool()
|
| 140 |
if tool_instance:
|
| 141 |
matches = tool_instance.check(fixed_text)
|
| 142 |
+
if matches:
|
| 143 |
+
fixed_text = tool_instance.correct(fixed_text)
|
| 144 |
diff_html = get_text_diff(text, fixed_text)
|
| 145 |
return JsonResponse({'spelled_text': fixed_text, 'diff_html': diff_html, 'is_perfect': text == fixed_text})
|
| 146 |
except Exception as e:
|
| 147 |
return JsonResponse({'error': str(e)})
|
| 148 |
|
| 149 |
+
|
| 150 |
@csrf_exempt
def ocr_process(request):
    """Run OCR on an uploaded image and return the recognized text as JSON.

    Expects a POST with an ``image`` file field. Responds with
    ``{'text': ...}`` on success or ``{'error': ...}`` on any failure.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Метод не поддерживается'})
    image_file = request.FILES.get('image')
    if not image_file:
        return JsonResponse({'error': 'Файл не найден'})
    try:
        file_bytes = image_file.read()
        nparr = np.frombuffer(file_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # cv2.imdecode returns None (it does not raise) for corrupt or
        # unsupported data; without this check limit_image_size() crashes
        # on `None.shape` and the user gets an unhelpful generic error.
        if img is None:
            return JsonResponse({'error': 'Не удалось прочитать изображение'})
        # Downscale very large images to keep OCR latency bounded.
        img = limit_image_size(img)

        ocr = get_ocr()
        result = ocr.ocr(img)

        # PaddleOCR returns [page][line] where line = (box, (text, score)).
        final_text_blocks = []
        if result and result[0]:
            for line in result[0]:
                final_text_blocks.append(line[1][0])

        import re
        # Joining blocks with spaces leaves a stray space before punctuation;
        # collapse it (e.g. "слово ," -> "слово,").
        clean_text = re.sub(r'\s+([,.!?;:])', r'\1', " ".join(final_text_blocks))
        return JsonResponse({'text': clean_text})
    except Exception as e:
        return JsonResponse({'error': f'Ошибка OCR: {str(e)}'})
|
|
|