Spaces:
Build error
Build error
| import os | |
| import glob | |
| import nltk | |
| from huggingface_hub import login | |
| import gradio as gr | |
# --- NLTK setup ---
# Ensure the 'punkt' sentence tokenizer data is available; download it once if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Quiet download keeps Spaces build logs clean.
    nltk.download('punkt', quiet=True)
| from langchain_community.document_loaders import UnstructuredWordDocumentLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains import RetrievalQA | |
| from langchain_huggingface import HuggingFaceEndpoint | |
# --- Constants ---
FAISS_INDEX_PATH = "faiss_index"  # directory where the FAISS index is persisted on disk
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # sentence-embedding model for the vector store
REPO_ID = "google/flan-t5-large"  # Hugging Face repo served through HuggingFaceEndpoint as the LLM
DOCX_FILE_PATH = ""  # set at runtime to the first *.docx found (see create_vector_db)
| # --- Создание базы знаний --- | |
def create_vector_db():
    """Build the FAISS knowledge base from the first .docx in the CWD, or reuse an existing one.

    Side effects:
        Sets the module-level DOCX_FILE_PATH to the discovered source document
        and, when no index exists yet, writes a new FAISS index to FAISS_INDEX_PATH.

    Raises:
        FileNotFoundError: if the index must be built but no .docx file is present.
    """
    global DOCX_FILE_PATH
    # Discover the source document once; both branches below need it.
    docx_files = glob.glob("*.docx")
    if not os.path.exists(FAISS_INDEX_PATH):
        print("База знаний не найдена. Создаю новую...")
        if not docx_files:
            raise FileNotFoundError("Ошибка: Не найден .docx файл.")
        DOCX_FILE_PATH = docx_files[0]
        loader = UnstructuredWordDocumentLoader(DOCX_FILE_PATH)
        documents = loader.load()
        # Overlapping chunks so answers near chunk boundaries keep their context.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        docs = text_splitter.split_documents(documents)
        embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(FAISS_INDEX_PATH)
        print("База знаний создана.")
    else:
        print("База знаний найдена.")
        # Index already exists: only record the document name for the UI banner,
        # and tolerate the .docx being absent (the index alone is sufficient).
        if docx_files:
            DOCX_FILE_PATH = docx_files[0]
| # --- Инициализация QA --- | |
def initialize_qa_chain():
    """Authenticate with Hugging Face, load the saved FAISS index, and assemble a RetrievalQA chain.

    Returns:
        A RetrievalQA chain ("stuff" strategy) over the top-3 retrieved chunks.

    Raises:
        ValueError: if the HUGGINGFACEHUB_API_TOKEN environment variable is not set.
    """
    hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if not hf_token:
        raise ValueError("Не найден HUGGINGFACEHUB_API_TOKEN.")
    login(token=hf_token)

    # Re-create the same embedding model used at index-build time, then load the index.
    vector_store = FAISS.load_local(
        FAISS_INDEX_PATH,
        HuggingFaceEmbeddings(model_name=MODEL_NAME),
        allow_dangerous_deserialization=True,
    )
    language_model = HuggingFaceEndpoint(
        repo_id=REPO_ID,
        max_new_tokens=512,
        temperature=0.3,
        repetition_penalty=1.1,
        huggingfacehub_api_token=hf_token,
    )
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=retriever,
    )
# --- Main code ---
# Build (or reuse) the vector store and wire up the QA chain once at startup.
create_vector_db()
qa_chain = initialize_qa_chain()
def chatbot_response(message, history):
    """Answer one chat message through the retrieval-QA chain.

    `history` is required by the gr.ChatInterface callback signature
    but is intentionally unused — each question is answered statelessly.
    """
    answer = qa_chain.invoke(message)
    return answer["result"]
# --- Interface ---
with gr.Blocks(theme='gradio/soft', title="AI-Ассистент по ВКР") as demo:
    gr.Markdown("# 🤖 AI-Ассистент по вопросам ВКР")
    # Show the source-document banner only when a .docx was discovered at startup.
    if DOCX_FILE_PATH:
        gr.Markdown(f"Бот отвечает на вопросы на основе документа: **{os.path.basename(DOCX_FILE_PATH)}**.")
    gr.ChatInterface(
        fn=chatbot_response,
        title=None,
        examples=[
            "Какие требования к объему магистерской диссертации?",
            "Как правильно оформить список литературы?",
            "Какие сроки сдачи и защиты ВКР в этом году?",
            "Что должно быть во введении?",
            "Какой процент оригинальности требуется?"
        ]
    )
demo.launch()