Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,7 +21,6 @@ class KDChecker:
|
|
| 21 |
def load_excel_db(self, excel_path):
|
| 22 |
print(f"--- Загрузка Excel: {excel_path} ---")
|
| 23 |
if excel_path is None:
|
| 24 |
-
# Возвращаем: Сообщение, Пустой список авторов, Пустой список шкафов
|
| 25 |
return "Файл не выбран", gr.update(choices=[], value=None), gr.update(choices=[], value=None)
|
| 26 |
|
| 27 |
all_data = []
|
|
@@ -52,7 +51,6 @@ class KDChecker:
|
|
| 52 |
df = pd.read_excel(excel_path, sheet_name=sheet_name, header=header_row_index)
|
| 53 |
df_subset = df.iloc[:, [cab_col_idx, rem_col_idx]]
|
| 54 |
df_subset.columns = ["Cabinet", "Remark"]
|
| 55 |
-
# Добавляем колонку с именем листа (Автором)
|
| 56 |
df_subset["Author"] = sheet_name
|
| 57 |
|
| 58 |
df_subset["Cabinet"] = df_subset["Cabinet"].ffill()
|
|
@@ -67,14 +65,11 @@ class KDChecker:
|
|
| 67 |
return "❌ Ошибка: Не найдены заголовки 'Шкаф' и 'Примечание'.", gr.update(choices=[]), gr.update(choices=[])
|
| 68 |
|
| 69 |
self.excel_db = pd.concat(all_data, ignore_index=True)
|
| 70 |
-
|
| 71 |
-
# Получаем список уникальных авторов (листов)
|
| 72 |
authors_list = sorted(self.excel_db["Author"].unique().tolist())
|
| 73 |
|
| 74 |
print(f"Excel загружен. Всего строк: {len(self.excel_db)}")
|
| 75 |
msg = f"✅ База загружена!\nЗаписей: {len(self.excel_db)}\nАвторы: {', '.join(sheets_log)}"
|
| 76 |
|
| 77 |
-
# Возвращаем: Статус, Список авторов, Очищаем список шкафов
|
| 78 |
return msg, gr.update(choices=authors_list, value=None, interactive=True), gr.update(choices=[], value=None)
|
| 79 |
|
| 80 |
except Exception as e:
|
|
@@ -82,11 +77,8 @@ class KDChecker:
|
|
| 82 |
return f"❌ Ошибка: {e}", gr.update(choices=[]), gr.update(choices=[])
|
| 83 |
|
| 84 |
def get_cabinets_by_author(self, author_name):
|
| 85 |
-
"""Фильтрует шкафы по выбранному автору"""
|
| 86 |
if self.excel_db.empty or not author_name:
|
| 87 |
return gr.update(choices=[], value=None)
|
| 88 |
-
|
| 89 |
-
# Фильтруем базу по автору
|
| 90 |
filtered_cabs = self.excel_db[self.excel_db["Author"] == author_name]["Cabinet"].unique().tolist()
|
| 91 |
return gr.update(choices=sorted(filtered_cabs), value=None, interactive=True)
|
| 92 |
|
|
@@ -94,7 +86,6 @@ class KDChecker:
|
|
| 94 |
text = ""
|
| 95 |
try:
|
| 96 |
with pdfplumber.open(pdf_path) as pdf:
|
| 97 |
-
# Читаем первые 5 страниц для скорости поиска номера
|
| 98 |
for page in pdf.pages[:5]:
|
| 99 |
text += (page.extract_text() or "") + "\n"
|
| 100 |
except Exception as e:
|
|
@@ -102,12 +93,28 @@ class KDChecker:
|
|
| 102 |
return text
|
| 103 |
|
| 104 |
def find_all_decimal_numbers(self, text):
|
| 105 |
-
pattern = r"(РЛТ|ЛДАР|ВНАР|ШТМ)[\s\.]*\d{1}[\s\.]*\d{3}[\s\.]*[А-ЯA-Z]{1,4}[\s\.]*\d{3}(-[\d]+)?"
|
| 106 |
matches = []
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
clean_num = match.group(0).replace(" ", "").replace("\n", "")
|
| 109 |
if clean_num not in matches:
|
| 110 |
matches.append(clean_num)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
return matches
|
| 112 |
|
| 113 |
def determine_doc_type(self, filename):
|
|
@@ -179,23 +186,43 @@ class KDChecker:
|
|
| 179 |
found_by_method = "manual"
|
| 180 |
is_manual = True
|
| 181 |
else:
|
| 182 |
-
# === ОПТИМИЗАЦИЯ ===
|
| 183 |
db_clean_keys = set(self.excel_db["Cabinet_Clean"].tolist())
|
| 184 |
|
| 185 |
for file_path in progress.tqdm(files, desc="Поиск номера шкафа"):
|
| 186 |
-
|
| 187 |
-
pdf_numbers = self.find_all_decimal_numbers(text)
|
| 188 |
|
|
|
|
|
|
|
| 189 |
for cand in pdf_numbers:
|
| 190 |
if cand in db_clean_keys:
|
| 191 |
detected_cabinet = cand
|
| 192 |
found_by_method = "number"
|
| 193 |
-
break
|
| 194 |
|
| 195 |
if found_by_method == "number":
|
| 196 |
-
print(f"✅ Шкаф найден
|
| 197 |
break
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
print(f"Определен шкаф: {detected_cabinet}")
|
| 200 |
|
| 201 |
if detected_cabinet == "Не определен":
|
|
@@ -240,7 +267,6 @@ class KDChecker:
|
|
| 240 |
form = c.acroForm
|
| 241 |
width, height = A4
|
| 242 |
|
| 243 |
-
# --- ШРИФТЫ ---
|
| 244 |
font_name = 'Helvetica'
|
| 245 |
local_font = "arial.ttf"
|
| 246 |
|
|
@@ -335,28 +361,18 @@ def create_app():
|
|
| 335 |
gr.Markdown("## ✅ Генератор чек-листов КД")
|
| 336 |
|
| 337 |
with gr.Row():
|
| 338 |
-
# --- ЛЕВАЯ КОЛОНКА: БАЗА ---
|
| 339 |
with gr.Column(scale=1):
|
| 340 |
gr.Markdown("### 1. База знаний")
|
| 341 |
db_in = gr.File(label="Excel (.xlsx)", type="filepath", elem_classes="compact_file")
|
| 342 |
-
|
| 343 |
-
# Группа ручного выбора
|
| 344 |
with gr.Group():
|
| 345 |
-
gr.Markdown("#### Ручной
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
# Затем шкаф (зависит от автора)
|
| 349 |
-
cabinet_dd = gr.Dropdown(label="2. Выберите шкаф", choices=[], interactive=True)
|
| 350 |
-
|
| 351 |
db_out = gr.Textbox(label="Статус базы", lines=2, max_lines=3, interactive=False)
|
| 352 |
|
| 353 |
-
# Событие загрузки Excel: обновляет статус и список авторов
|
| 354 |
db_in.upload(checker.load_excel_db, inputs=[db_in], outputs=[db_out, author_dd, cabinet_dd])
|
| 355 |
-
|
| 356 |
-
# Событие выбора автора: фильтрует список шкафов
|
| 357 |
author_dd.change(checker.get_cabinets_by_author, inputs=[author_dd], outputs=[cabinet_dd])
|
| 358 |
|
| 359 |
-
# --- ПРАВАЯ КОЛОНКА: ЧЕРТЕЖИ ---
|
| 360 |
with gr.Column(scale=1):
|
| 361 |
gr.Markdown("### 2. Документация")
|
| 362 |
files_in = gr.File(label="Чертежи (PDF)", file_count="multiple", type="filepath", elem_classes="compact_file")
|
|
|
|
| 21 |
def load_excel_db(self, excel_path):
|
| 22 |
print(f"--- Загрузка Excel: {excel_path} ---")
|
| 23 |
if excel_path is None:
|
|
|
|
| 24 |
return "Файл не выбран", gr.update(choices=[], value=None), gr.update(choices=[], value=None)
|
| 25 |
|
| 26 |
all_data = []
|
|
|
|
| 51 |
df = pd.read_excel(excel_path, sheet_name=sheet_name, header=header_row_index)
|
| 52 |
df_subset = df.iloc[:, [cab_col_idx, rem_col_idx]]
|
| 53 |
df_subset.columns = ["Cabinet", "Remark"]
|
|
|
|
| 54 |
df_subset["Author"] = sheet_name
|
| 55 |
|
| 56 |
df_subset["Cabinet"] = df_subset["Cabinet"].ffill()
|
|
|
|
| 65 |
return "❌ Ошибка: Не найдены заголовки 'Шкаф' и 'Примечание'.", gr.update(choices=[]), gr.update(choices=[])
|
| 66 |
|
| 67 |
self.excel_db = pd.concat(all_data, ignore_index=True)
|
|
|
|
|
|
|
| 68 |
authors_list = sorted(self.excel_db["Author"].unique().tolist())
|
| 69 |
|
| 70 |
print(f"Excel загружен. Всего строк: {len(self.excel_db)}")
|
| 71 |
msg = f"✅ База загружена!\nЗаписей: {len(self.excel_db)}\nАвторы: {', '.join(sheets_log)}"
|
| 72 |
|
|
|
|
| 73 |
return msg, gr.update(choices=authors_list, value=None, interactive=True), gr.update(choices=[], value=None)
|
| 74 |
|
| 75 |
except Exception as e:
|
|
|
|
| 77 |
return f"❌ Ошибка: {e}", gr.update(choices=[]), gr.update(choices=[])
|
| 78 |
|
| 79 |
def get_cabinets_by_author(self, author_name):
    """Build a dropdown update listing the cabinets of one author.

    Args:
        author_name: Value of the "Author" column to filter by. A falsy
            value (nothing selected) produces an empty dropdown.

    Returns:
        A ``gr.update`` payload for the cabinet dropdown: empty choices when
        the loaded table is empty or no author is given, otherwise the
        sorted unique "Cabinet" values of that author with the selection
        cleared and the dropdown made interactive.
    """
    db = self.excel_db
    if db.empty or not author_name:
        return gr.update(choices=[], value=None)

    # Keep only the rows that belong to the selected author.
    author_mask = db["Author"] == author_name
    cabinets = db.loc[author_mask, "Cabinet"].unique().tolist()
    cabinets.sort()
    return gr.update(choices=cabinets, value=None, interactive=True)
|
| 84 |
|
|
|
|
| 86 |
text = ""
|
| 87 |
try:
|
| 88 |
with pdfplumber.open(pdf_path) as pdf:
|
|
|
|
| 89 |
for page in pdf.pages[:5]:
|
| 90 |
text += (page.extract_text() or "") + "\n"
|
| 91 |
except Exception as e:
|
|
|
|
| 93 |
return text
|
| 94 |
|
| 95 |
def find_all_decimal_numbers(self, text):
    """Extract the unique cabinet designation numbers found in ``text``.

    Two notations are recognised, both starting with one of the prefixes
    РЛТ, ЛДАР, ВНАР or ШТМ:

    * project-specific: prefix, 1 digit, 3 digits, 1-4 capital letters,
      3 digits and an optional ``-N`` suffix, e.g. ``РЛТ.1.006.ША.030``;
    * GOST-style: prefix, 6 digits, 3 digits, e.g. ``ЛДАР.421246.337``.

    The groups may be separated by any mix of dots and whitespace (or by
    nothing at all), which tolerates line breaks inside a number.

    Args:
        text: Text to scan. ``None`` or an empty string yields no matches.

    Returns:
        A list of matched numbers with all whitespace removed and duplicates
        dropped. Project-specific matches come first, then GOST-style ones,
        each group in order of first appearance.
    """
    if not text:
        return []

    patterns = (
        # Project-specific notation (РЛТ.1.006.ША.030).
        r"(РЛТ|ЛДАР|ВНАР|ШТМ)[\s\.]*\d{1}[\s\.]*\d{3}[\s\.]*[А-ЯA-Z]{1,4}[\s\.]*\d{3}(-[\d]+)?",
        # Standard GOST notation (ЛДАР.421246.337).
        r"(РЛТ|ЛДАР|ВНАР|ШТМ)[\s\.]*\d{6}[\s\.]*\d{3}",
    )

    # A dict keeps insertion order and gives O(1) duplicate detection.
    found = {}
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            # The patterns accept any whitespace between groups (tabs,
            # carriage returns, non-breaking spaces), so strip all of it,
            # not only " " and "\n"; otherwise such a number would never
            # equal a database key.
            clean_num = "".join(match.group(0).split())
            found.setdefault(clean_num, None)
    return list(found)
|
| 119 |
|
| 120 |
def determine_doc_type(self, filename):
|
|
|
|
| 186 |
found_by_method = "manual"
|
| 187 |
is_manual = True
|
| 188 |
else:
|
|
|
|
| 189 |
db_clean_keys = set(self.excel_db["Cabinet_Clean"].tolist())
|
| 190 |
|
| 191 |
for file_path in progress.tqdm(files, desc="Поиск номера шкафа"):
|
| 192 |
+
raw_text = self.extract_text(file_path)
|
|
|
|
| 193 |
|
| 194 |
+
# --- ПОИСК ПО НОМЕРУ (2 ШАБЛОНА) ---
|
| 195 |
+
pdf_numbers = self.find_all_decimal_numbers(raw_text)
|
| 196 |
for cand in pdf_numbers:
|
| 197 |
if cand in db_clean_keys:
|
| 198 |
detected_cabinet = cand
|
| 199 |
found_by_method = "number"
|
| 200 |
+
break
|
| 201 |
|
| 202 |
if found_by_method == "number":
|
| 203 |
+
print(f"✅ Шкаф найден по номеру: {detected_cabinet}")
|
| 204 |
break
|
| 205 |
|
| 206 |
+
# --- ПОИСК ПО ИМЕНИ (УЛУЧШЕННЫЙ) ---
|
| 207 |
+
# Убираем переносы строк, чтобы "Шкаф\nСАУ" стало "Шкаф САУ"
|
| 208 |
+
flat_text = raw_text.replace("\n", " ").replace(" ", " ").lower()
|
| 209 |
+
|
| 210 |
+
unique_cabinets = self.excel_db["Cabinet"].unique()
|
| 211 |
+
for cab_name in unique_cabinets:
|
| 212 |
+
# Ищем только если это похоже на название, а не на код
|
| 213 |
+
if "ЛДАР" in cab_name or "РЛТ" in cab_name: continue
|
| 214 |
+
|
| 215 |
+
# Проверяем точное вхождение названия
|
| 216 |
+
clean_name = cab_name.lower().strip()
|
| 217 |
+
if len(clean_name) > 5 and clean_name in flat_text:
|
| 218 |
+
detected_cabinet = cab_name
|
| 219 |
+
found_by_method = "name"
|
| 220 |
+
print(f"✅ Шкаф найден по имени: {cab_name}")
|
| 221 |
+
break
|
| 222 |
+
|
| 223 |
+
if found_by_method == "name":
|
| 224 |
+
break
|
| 225 |
+
|
| 226 |
print(f"Определен шкаф: {detected_cabinet}")
|
| 227 |
|
| 228 |
if detected_cabinet == "Не определен":
|
|
|
|
| 267 |
form = c.acroForm
|
| 268 |
width, height = A4
|
| 269 |
|
|
|
|
| 270 |
font_name = 'Helvetica'
|
| 271 |
local_font = "arial.ttf"
|
| 272 |
|
|
|
|
| 361 |
gr.Markdown("## ✅ Генератор чек-листов КД")
|
| 362 |
|
| 363 |
with gr.Row():
|
|
|
|
| 364 |
with gr.Column(scale=1):
|
| 365 |
gr.Markdown("### 1. База знаний")
|
| 366 |
db_in = gr.File(label="Excel (.xlsx)", type="filepath", elem_classes="compact_file")
|
|
|
|
|
|
|
| 367 |
with gr.Group():
|
| 368 |
+
gr.Markdown("#### Ручной выбор:")
|
| 369 |
+
author_dd = gr.Dropdown(label="1. Автор", choices=[], interactive=True)
|
| 370 |
+
cabinet_dd = gr.Dropdown(label="2. Шкаф", choices=[], interactive=True)
|
|
|
|
|
|
|
|
|
|
| 371 |
db_out = gr.Textbox(label="Статус базы", lines=2, max_lines=3, interactive=False)
|
| 372 |
|
|
|
|
| 373 |
db_in.upload(checker.load_excel_db, inputs=[db_in], outputs=[db_out, author_dd, cabinet_dd])
|
|
|
|
|
|
|
| 374 |
author_dd.change(checker.get_cabinets_by_author, inputs=[author_dd], outputs=[cabinet_dd])
|
| 375 |
|
|
|
|
| 376 |
with gr.Column(scale=1):
|
| 377 |
gr.Markdown("### 2. Документация")
|
| 378 |
files_in = gr.File(label="Чертежи (PDF)", file_count="multiple", type="filepath", elem_classes="compact_file")
|