Spaces:

SvetlanaSS
/

MyBotBot

Sleeping

App Files Community

SvetlanaSS committed on Aug 21

Commit

73b6937

verified ·

1 Parent(s): 65c61a6

Create App.py

Browse files

Files changed (1) hide show

App.py +163 -0

App.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import gradio as gr
+import glob
+from docx import Document
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+import numpy as np
+def is_header(txt):
+   if not txt or len(txt) < 35:
+       if txt == txt.upper() and not txt.endswith(('.', ':', '?', '!')):
+           return True
+       if txt.istitle() and len(txt.split()) < 6 and not txt.endswith(('.', ':', '?', '!')):
+           return True
+   return False
+def get_blocks_from_docx():
+   docx_list = glob.glob("*.docx")
+   if not docx_list:
+       return [], []
+   doc = Document(docx_list[0])
+   blocks = []
+   normal_blocks = []
+   for p in doc.paragraphs:
+       txt = p.text.strip()
+       if (
+           txt
+           and not (len(txt) <= 3 and txt.isdigit())
+           and len(txt.split()) > 3
+       ):
+           blocks.append(txt)
+           if not is_header(txt) and len(txt) > 25:
+               normal_blocks.append(txt)
+   for table in doc.tables:
+       for row in table.rows:
+           row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
+           if row_text and len(row_text.split()) > 3 and len(row_text) > 25:
+               blocks.append(row_text)
+               if not is_header(row_text):
+                   normal_blocks.append(row_text)
+   # remove duplicates
+   seen = set(); blocks_clean = []
+   for b in blocks:
+       if b not in seen:
+           blocks_clean.append(b)
+           seen.add(b)
+   seen = set(); normal_blocks_clean = []
+   for b in normal_blocks:
+       if b not in seen:
+           normal_blocks_clean.append(b)
+           seen.add(b)
+   return blocks_clean, normal_blocks_clean
+blocks, normal_blocks = get_blocks_from_docx()
+if not blocks or not normal_blocks:
+   blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]
+   normal_blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]
+vectorizer = TfidfVectorizer(lowercase=True).fit(blocks)
+matrix = vectorizer.transform(blocks)
+tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
+model = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask")
+model.eval()
+device = 'cpu'
+def rut5_answer(question, context):
+   prompt = f"question: {question} context: {context}"
+   input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+   with torch.no_grad():
+       output_ids = model.generate(
+           input_ids,
+           max_length=250, num_beams=4, min_length=40,
+           no_repeat_ngram_size=3, do_sample=False
+       )
+   return tokenizer.decode(output_ids[0], skip_special_tokens=True)
+def flatten_index(idx):
+   # Универсальный способ из всего достать int
+   if isinstance(idx, (int, float, np.integer, np.floating)):
+       return int(idx)
+   if isinstance(idx, (list, tuple, np.ndarray)):
+       if len(idx) == 0:
+           return 0
+       return flatten_index(idx)
+   if hasattr(idx, "tolist"):
+       item = idx.tolist()
+       return flatten_index(item)
+   try:
+       return int(idx)
+   except Exception:
+       return 0
+def ask_chatbot(question):
+   question = question.strip()
+   if not question:
+       return "Пожалуйста, введите вопрос."
+   if not normal_blocks or normal_blocks == ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]:
+       return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."
+   user_vec = vectorizer.transform([question.lower()])
+   sims = cosine_similarity(user_vec, matrix)[0]
+   n_blocks = min(3, len(blocks))
+   if n_blocks == 0:
+       return "Ошибка: база знаний отсутствует или пуста."
+   sorted_idxs = sims.argsort()[-n_blocks:][::-1]
+   context_blocks = []
+   for idx in sorted_idxs:
+       idx_int = flatten_index(idx)
+       if isinstance(idx_int, int) and 0 <= idx_int < len(blocks):
+           context_blocks.append(blocks[idx_int])
+   context = " ".join(context_blocks)
+   # Ответ только из абзацев, не заголовков!
+   best_normal_block = ""
+   max_sim = -1
+   for nb in normal_blocks:
+       v_nb = vectorizer.transform([nb.lower()])
+       sim = cosine_similarity(user_vec, v_nb)[0]
+       if sim > max_sim:
+           max_sim = sim
+           best_normal_block = nb
+   if not best_normal_block:
+       best_normal_block = context_blocks if context_blocks else ""
+   answer = rut5_answer(question, context)
+   if len(answer.strip().split()) < 8 or answer.count('.') < 2:
+       answer += "\n\n" + best_normal_block
+   if is_header(answer):
+       answer = best_normal_block
+   return answer
+# Sample questions offered in the UI below.
+EXAMPLES = [
+   "Как оформить список литературы?",
+   "Какие сроки сдачи и защиты ВКР?",
+   "Какой процент оригинальности требуется?",
+   "Как оформлять формулы?"
+]
+# Gradio UI: title, question box, button, answer area, examples, contacts.
+with gr.Blocks() as demo:
+   gr.Markdown(
+       "# Русскоязычный Чат-бот по методичке (AI+документ)\nЗадайте вопрос — получите развернутый ответ на основании вашего документа!"
+   )
+   question = gr.Textbox(label="Ваш вопрос", lines=2)
+   ask_btn = gr.Button("Получить ответ")
+   answer = gr.Markdown(label="Ответ", visible=True)
+   def with_spinner(q):
+       """Yield a "thinking" placeholder first, then the real answer for *q*."""
+       yield "Чат-бот думает..."
+       yield ask_chatbot(q)
+   # The same handler serves both the button click and the textbox submit event.
+   ask_btn.click(with_spinner, question, answer)
+   question.submit(with_spinner, question, answer)
+   gr.Markdown("#### Примеры вопросов:")
+   # The examples are wired to the question textbox.
+   gr.Examples(EXAMPLES, inputs=question)
+   # Contact section with blank fields left as a template.
+   gr.Markdown("""
+   ---
+   ### Контакты (укажите свои)
+   Преподаватель: ___________________
+   Email: ___________________________
+   Кафедра: _________________________
+   """)
+# Start the app as soon as the module is executed.
+demo.launch()