Spaces:

Wplotnikow
/

TeacherChat

Sleeping

App Files Files Community

TeacherChat / app.py

Wplotnikow

Update app.py

2b9cf4d verified 8 months ago

raw

history blame

4.87 kB

	import gradio as gr
	import glob
	from docx import Document
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	import torch
	from transformers import T5ForConditionalGeneration, T5Tokenizer

	def get_blocks_from_docx(pattern="*.docx"):
	    """Extract de-duplicated text blocks from the first matching .docx file.

	    Paragraphs are kept only when they look like body text (see
	    ``_is_body_text``).  Each table row is flattened into a single
	    ``" | "``-separated string and kept when it is longer than 35
	    characters and has more than three whitespace-separated tokens.

	    Args:
	        pattern: Glob pattern used to locate the document.  Matches are
	            sorted and the first one is read, so the choice is
	            deterministic when several files match.

	    Returns:
	        A list of unique text blocks in document order (paragraphs first,
	        then table rows).  When no file matches, a one-element list with a
	        "file not found" message is returned instead.
	    """
	    # glob returns matches in arbitrary order; sort so the same file is
	    # always picked.
	    docx_list = sorted(glob.glob(pattern))
	    if not docx_list:
	        return ["Файл .docx не найден!"]
	    doc = Document(docx_list[0])

	    stripped_paragraphs = (p.text.strip() for p in doc.paragraphs)
	    blocks = [txt for txt in stripped_paragraphs if _is_body_text(txt)]

	    # Tables: one block per row, skipping empty cells.
	    for table in doc.tables:
	        for row in table.rows:
	            cell_texts = (cell.text.strip() for cell in row.cells)
	            row_text = " | ".join(text for text in cell_texts if text)
	            # Same idea as for paragraphs: ignore very short rows, which
	            # are likely table headers.
	            if len(row_text) > 35 and len(row_text.split()) > 3:
	                blocks.append(row_text)

	    # dict preserves insertion order, so this drops duplicates while
	    # keeping the first occurrence of each block.
	    return list(dict.fromkeys(blocks))


	def _is_body_text(txt):
	    """Return True when *txt* looks like body text, not a heading or number."""
	    if not txt:
	        return False
	    # Bare numbers of up to three digits.
	    if len(txt) <= 3 and txt.isdigit():
	        return False
	    # Short all-uppercase lines without closing punctuation are treated as
	    # headings.
	    if len(txt) < 35 and txt == txt.upper() and not txt.endswith(('.', ':', '?', '!')):
	        return False
	    # Require more than three words; anything shorter is assumed to be a
	    # heading.
	    return len(txt.split()) > 3

	# Knowledge base: text blocks from the document and their TF-IDF matrix,
	# built once at import time and reused for every question.
	blocks = get_blocks_from_docx()
	vectorizer = TfidfVectorizer()
	matrix = vectorizer.fit_transform(blocks)

	# Generation model: loaded once, switched to inference mode.
	device = 'cpu'
	_MODEL_NAME = "cointegrated/rut5-base-multitask"
	tokenizer = T5Tokenizer.from_pretrained(_MODEL_NAME)
	model = T5ForConditionalGeneration.from_pretrained(_MODEL_NAME)
	model.eval()

	def rut5_answer(question, context):
	    """Generate an answer to *question* from *context* with the T5 model.

	    The two strings are combined into one prompt, tokenized, and passed to
	    ``model.generate`` with beam search; the first returned sequence is
	    decoded with special tokens removed.

	    Args:
	        question: The user's question text.
	        context: Text inserted into the prompt after the question.

	    Returns:
	        The decoded text of the first generated sequence.
	    """
	    # NOTE(review): confirm against the model card that this checkpoint
	    # expects the "question: ... context: ..." prompt layout.
	    encoded = tokenizer(f"question: {question} context: {context}", return_tensors="pt")
	    source_ids = encoded.input_ids.to(device)
	    generation_options = {
	        "max_length": 250,
	        "num_beams": 4,
	        "min_length": 40,
	        "no_repeat_ngram_size": 3,
	    }
	    # Inference only: gradients are not needed.
	    with torch.no_grad():
	        generated = model.generate(source_ids, **generation_options)
	    first_sequence = generated[0]
	    return tokenizer.decode(first_sequence, skip_special_tokens=True)

	def ask_chatbot(question):
	    """Answer a question using the most similar knowledge-base blocks.

	    The question is vectorized with the module-level TF-IDF vectorizer and
	    compared to every block by cosine similarity.  Up to three of the
	    best-scoring blocks (score above 0.08, more than three words, more than
	    35 characters) are joined into a context and passed to ``rut5_answer``.

	    Args:
	        question: The user's question; ``None`` or blank text is rejected.

	    Returns:
	        The generated answer.  When the answer has fewer than eight words
	        or contains no period, the context is appended after a blank line.
	        A fixed message is returned for a blank question, a knowledge base
	        with fewer than two blocks, or when no block passes the filter.
	    """
	    if not question or not question.strip():
	        return "Пожалуйста, введите вопрос."
	    if len(blocks) < 2:
	        return "Ошибка: база знаний пуста или слишком мала. Проверьте .docx."
	    user_vec = vectorizer.transform([question])
	    # cosine_similarity returns a (1, n_blocks) matrix.  Take its only row
	    # so that argsort yields scalar indices and sims[i] is a scalar score;
	    # indexing the 2-D result directly raises IndexError.
	    sims = cosine_similarity(user_vec, matrix)[0]
	    # Indices of the three highest scores, best first.
	    top_idxs = sims.argsort()[-3:][::-1]
	    # Use only sufficiently similar, non-short blocks as context.
	    context_blocks = [
	        blocks[i]
	        for i in top_idxs
	        if sims[i] > 0.08 and len(blocks[i].split()) > 3 and len(blocks[i]) > 35
	    ]
	    if not context_blocks:
	        # Without context the model would generate from the question
	        # alone, so report the miss instead.
	        return "В документе не найдено информации по вашему вопросу. Попробуйте переформулировать его."
	    context = " ".join(context_blocks)
	    answer = rut5_answer(question, context)
	    # Safety net: if the answer is very short or has no full sentence,
	    # append the retrieved context so the user still sees the source text.
	    if len(answer.strip().split()) < 8 or len(answer.split('.')) < 2:
	        answer += "\n\n" + context
	    return answer

	# Sample questions offered as clickable inputs in the UI.
	EXAMPLES = [
	    "Какие требования к объему магистерской диссертации?",
	    "Как оформить список литературы?",
	    "Какие сроки сдачи и защиты ВКР?",
	    "Что должно быть во введении?",
	    "Какой процент оригинальности требуется?",
	    "Как оформлять формулы?",
	]

	# Header text shown at the top of the page.
	_INTRO_MARKDOWN = """
	# Русскоязычный FAQ-чат-бот на базе вашей методички и нейросетевой модели

	Задайте вопрос — получите развернутый AI-ответ на русском языке на основании вашего документа!
	"""

	# Contact placeholders shown at the bottom of the page.
	_CONTACTS_MARKDOWN = """
	---

	### Контакты (укажите свои)
	Преподаватель: ___________________
	Email: ___________________________
	Кафедра: _________________________
	"""

	with gr.Blocks() as demo:
	    gr.Markdown(_INTRO_MARKDOWN)
	    question = gr.Textbox(label="Ваш вопрос", lines=2)
	    ask_btn = gr.Button("Получить ответ")
	    answer = gr.Markdown(label="Ответ", visible=True)
	    # The button click and the textbox submit event share one handler.
	    ask_btn.click(fn=ask_chatbot, inputs=question, outputs=answer)
	    question.submit(fn=ask_chatbot, inputs=question, outputs=answer)
	    gr.Markdown("#### Примеры вопросов:")

	    # Clickable examples that populate the question box.
	    gr.Examples(EXAMPLES, inputs=question)

	    gr.Markdown(_CONTACTS_MARKDOWN)

	demo.launch()