itemaz committed on
Commit
9095d2c
·
verified ·
1 Parent(s): ec9b12f

Update summary/views.py

Browse files
Files changed (1) hide show
  1. summary/views.py +40 -33
summary/views.py CHANGED
@@ -13,6 +13,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
  import torch
14
  from deep_translator import GoogleTranslator
15
 
 
16
  # --- НАСТРОЙКИ ОКРУЖЕНИЯ ---
17
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
18
  os.environ['HF_HOME'] = '/tmp/.cache'
@@ -21,13 +22,16 @@ os.environ['FLAGS_enable_pir_api'] = '0'
21
  # Отключаем проверку обновлений для Paddle, чтобы он не висел
22
  os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
23
 
 
24
  logger = logging.getLogger(__name__)
25
 
 
26
  # Глобальные переменные (ОБЯЗАТЕЛЬНО ВСЕ ТРИ)
27
  _summarizer_model = None
28
  _summarizer_tokenizer = None
29
  _ocr_model = None
30
- _tool = None # Теперь точно объявлена
 
31
 
32
  def get_summarizer():
33
  global _summarizer_model, _summarizer_tokenizer
@@ -39,6 +43,7 @@ def get_summarizer():
39
  print("✅ Модель Bart загружена")
40
  return _summarizer_model, _summarizer_tokenizer
41
 
 
42
  def get_tool():
43
  global _tool
44
  if _tool is None:
@@ -46,16 +51,13 @@ def get_tool():
46
  _tool = language_tool_python.LanguageTool('ru-RU')
47
  return _tool
48
 
 
49
  def get_ocr():
50
  global _ocr_model
51
  if _ocr_model is None:
 
52
  from paddleocr import PaddleOCR
53
- # Убрали use_gpu (автоопределение) и заменили устаревший use_angle_cls
54
- _ocr_model = PaddleOCR(
55
- use_textline_orientation=True,
56
- lang='ru',
57
- show_log=False
58
- )
59
  return _ocr_model
60
 
61
 
@@ -69,6 +71,7 @@ def get_text_diff(old_text, new_text):
69
  result.append(f'<span style="background:#ffcccb; color:#b31d28; border-radius:3px; padding:0 2px;">{old_text[i1:i2]}</span>')
70
  return "".join(result)
71
 
 
72
  def limit_image_size(image, max_width=1920, max_height=1080):
73
  h, w = image.shape[:2]
74
  if h > max_height or w > max_width:
@@ -76,27 +79,30 @@ def limit_image_size(image, max_width=1920, max_height=1080):
76
  return cv2.resize(image, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LANCZOS4)
77
  return image
78
 
 
79
  def index(request):
80
  return render(request, 'index.html')
81
 
 
82
  @csrf_exempt
83
  @require_POST
84
  def summarize(request):
85
  text = request.POST.get('text', '')
86
- if not text.strip(): return JsonResponse({'error': 'Текст пуст'})
 
87
  try:
88
  model, tokenizer = get_summarizer()
89
  orig_words_count = len(text.split())
90
-
91
  is_russian = any('\u0400' <= c <= '\u04FF' for c in text)
92
  text_for_ai = GoogleTranslator(source='auto', target='en').translate(text[:2000]) if is_russian else text[:2000]
93
 
94
  inputs = tokenizer(text_for_ai, return_tensors="pt", max_length=1024, truncation=True)
95
  summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
96
- summary_txt = tokenizer.decode(summary_ids[0], skip_special_tokens=True) # Добавил [0]
97
 
98
  final = GoogleTranslator(source='en', target='ru').translate(summary_txt) if is_russian else summary_txt
99
-
100
  return JsonResponse({
101
  'summary': final,
102
  'orig_words': orig_words_count,
@@ -105,23 +111,27 @@ def summarize(request):
105
  except Exception as e:
106
  return JsonResponse({'error': f"Ошибка суммаризации: {str(e)}"})
107
 
 
108
  @csrf_exempt
109
  @require_POST
110
  def translate(request):
111
  text = request.POST.get('text', '')
112
  to_lang = request.POST.get('to', 'en')
113
- if not text.strip(): return JsonResponse({'error': 'Текст пуст'})
 
114
  try:
115
  res = GoogleTranslator(source='auto', target=to_lang).translate(text[:5000])
116
  return JsonResponse({'translation': res})
117
  except Exception as e:
118
  return JsonResponse({'error': str(e)})
119
 
 
120
  @csrf_exempt
121
  @require_POST
122
  def spellcheck(request):
123
  text = request.POST.get('text', '')
124
- if not text.strip(): return JsonResponse({'error': 'Введите текст'})
 
125
  try:
126
  from pyaspeller import YandexSpeller
127
  speller = YandexSpeller()
@@ -129,40 +139,37 @@ def spellcheck(request):
129
  tool_instance = get_tool()
130
  if tool_instance:
131
  matches = tool_instance.check(fixed_text)
132
- if matches: fixed_text = tool_instance.correct(fixed_text)
 
133
  diff_html = get_text_diff(text, fixed_text)
134
  return JsonResponse({'spelled_text': fixed_text, 'diff_html': diff_html, 'is_perfect': text == fixed_text})
135
  except Exception as e:
136
  return JsonResponse({'error': str(e)})
137
 
 
138
  @csrf_exempt
139
  def ocr_process(request):
140
- if request.method != 'POST': return JsonResponse({'error': 'Метод не поддерживается'})
 
141
  image_file = request.FILES.get('image')
142
- if not image_file: return JsonResponse({'error': 'Файл не найден'})
143
-
144
  try:
145
- # --- MISSING PART: Decode the image file into a CV2 matrix ---
146
- file_bytes = np.frombuffer(image_file.read(), np.uint8)
147
- img = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
148
-
149
- if img is None:
150
- return JsonResponse({'error': 'Не удалось прочитать изображение'})
151
-
152
- # --- OCR EXECUTION ---
153
  ocr = get_ocr()
154
  result = ocr.ocr(img)
155
 
156
- # --- EXTRACT TEXT (PaddleOCR returns nested lists) ---
157
- extracted_text = []
158
  if result and result[0]:
159
  for line in result[0]:
160
- text_val = line[1][0] # Index 1 is the (text, score) tuple, Index 0 is the text
161
- extracted_text.append(text_val)
162
-
163
- clean_text = " ".join(extracted_text)
164
  return JsonResponse({'text': clean_text})
165
-
166
  except Exception as e:
167
  return JsonResponse({'error': f'Ошибка OCR: {str(e)}'})
168
-
 
13
  import torch
14
  from deep_translator import GoogleTranslator
15
 
16
+
17
  # --- НАСТРОЙКИ ОКРУЖЕНИЯ ---
18
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
19
  os.environ['HF_HOME'] = '/tmp/.cache'
 
22
  # Отключаем проверку обновлений для Paddle, чтобы он не висел
23
  os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
24
 
25
+
26
  logger = logging.getLogger(__name__)
27
 
28
+
29
  # Module-level globals (all four must be declared)
30
  _summarizer_model = None
31
  _summarizer_tokenizer = None
32
  _ocr_model = None
33
+ _tool = None # Теперь точно объявлена
34
+
35
 
36
  def get_summarizer():
37
  global _summarizer_model, _summarizer_tokenizer
 
43
  print("✅ Модель Bart загружена")
44
  return _summarizer_model, _summarizer_tokenizer
45
 
46
+
47
  def get_tool():
48
  global _tool
49
  if _tool is None:
 
51
  _tool = language_tool_python.LanguageTool('ru-RU')
52
  return _tool
53
 
54
+
55
  def get_ocr():
56
  global _ocr_model
57
  if _ocr_model is None:
58
+ print("🚀 Загрузка PaddleOCR...")
59
  from paddleocr import PaddleOCR
60
+ _ocr_model = PaddleOCR(use_angle_cls=True, lang='ru', enable_mkldnn=False)
 
 
 
 
 
61
  return _ocr_model
62
 
63
 
 
71
  result.append(f'<span style="background:#ffcccb; color:#b31d28; border-radius:3px; padding:0 2px;">{old_text[i1:i2]}</span>')
72
  return "".join(result)
73
 
74
+
75
  def limit_image_size(image, max_width=1920, max_height=1080):
76
  h, w = image.shape[:2]
77
  if h > max_height or w > max_width:
 
79
  return cv2.resize(image, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LANCZOS4)
80
  return image
81
 
82
+
def index(request):
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render(request, 'index.html')
    return page
85
 
86
+
87
  @csrf_exempt
88
  @require_POST
89
  def summarize(request):
90
  text = request.POST.get('text', '')
91
+ if not text.strip():
92
+ return JsonResponse({'error': 'Текст пуст'})
93
  try:
94
  model, tokenizer = get_summarizer()
95
  orig_words_count = len(text.split())
96
+
97
  is_russian = any('\u0400' <= c <= '\u04FF' for c in text)
98
  text_for_ai = GoogleTranslator(source='auto', target='en').translate(text[:2000]) if is_russian else text[:2000]
99
 
100
  inputs = tokenizer(text_for_ai, return_tensors="pt", max_length=1024, truncation=True)
101
  summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
102
+ summary_txt = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
103
 
104
  final = GoogleTranslator(source='en', target='ru').translate(summary_txt) if is_russian else summary_txt
105
+
106
  return JsonResponse({
107
  'summary': final,
108
  'orig_words': orig_words_count,
 
111
  except Exception as e:
112
  return JsonResponse({'error': f"Ошибка суммаризации: {str(e)}"})
113
 
114
+
115
  @csrf_exempt
116
  @require_POST
117
  def translate(request):
118
  text = request.POST.get('text', '')
119
  to_lang = request.POST.get('to', 'en')
120
+ if not text.strip():
121
+ return JsonResponse({'error': 'Текст пуст'})
122
  try:
123
  res = GoogleTranslator(source='auto', target=to_lang).translate(text[:5000])
124
  return JsonResponse({'translation': res})
125
  except Exception as e:
126
  return JsonResponse({'error': str(e)})
127
 
128
+
129
  @csrf_exempt
130
  @require_POST
131
  def spellcheck(request):
132
  text = request.POST.get('text', '')
133
+ if not text.strip():
134
+ return JsonResponse({'error': 'Введите текст'})
135
  try:
136
  from pyaspeller import YandexSpeller
137
  speller = YandexSpeller()
 
139
  tool_instance = get_tool()
140
  if tool_instance:
141
  matches = tool_instance.check(fixed_text)
142
+ if matches:
143
+ fixed_text = tool_instance.correct(fixed_text)
144
  diff_html = get_text_diff(text, fixed_text)
145
  return JsonResponse({'spelled_text': fixed_text, 'diff_html': diff_html, 'is_perfect': text == fixed_text})
146
  except Exception as e:
147
  return JsonResponse({'error': str(e)})
148
 
149
+
@csrf_exempt
def ocr_process(request):
    """Run OCR on an uploaded image and return the recognised text as JSON.

    Expects a POST request with the image under ``request.FILES['image']``.
    The upload is decoded with OpenCV, passed through ``limit_image_size``,
    and handed to the engine from ``get_ocr()``. Recognised strings are
    joined with single spaces, and whitespace directly before punctuation
    (``, . ! ? ; :``) is removed.

    Returns:
        JsonResponse with ``text`` on success, or ``error`` when the method
        is not POST, no file was sent, the bytes are not a decodable image,
        or any step raises.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Метод не поддерживается'})
    image_file = request.FILES.get('image')
    if not image_file:
        return JsonResponse({'error': 'Файл не найден'})
    try:
        import re

        nparr = np.frombuffer(image_file.read(), np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # imdecode signals an undecodable buffer by returning None; without
        # this guard limit_image_size() would fail on `None.shape` and the
        # client would get an opaque AttributeError message.
        if img is None:
            return JsonResponse({'error': 'Не удалось прочитать изображение'})
        img = limit_image_size(img)

        ocr = get_ocr()
        result = ocr.ocr(img)

        final_text_blocks = []
        if result and result[0]:
            # NOTE(review): assumes each entry of result[0] is indexable as
            # entry[1][0] == recognised text — confirm against the installed
            # PaddleOCR version.
            final_text_blocks = [line[1][0] for line in result[0]]

        clean_text = re.sub(r'\s+([,.!?;:])', r'\1', " ".join(final_text_blocks))
        return JsonResponse({'text': clean_text})
    except Exception as e:
        return JsonResponse({'error': f'Ошибка OCR: {str(e)}'})