"""Build a small RAG knowledge base.

Collects text fragments from four sources (Habr articles, hard-coded
recipes, Russian Wikipedia summaries, Reddit posts), saves them as JSON
chunk files under KNOWLEDGE_DIR, and precomputes sentence embeddings
for the "text" field of every fragment with a multilingual
SentenceTransformer model.

Side effects: network requests, model download on first run, and three
files written: full_knowledge.json, chunks.json, embeddings.npy.
"""
import json
import os
import sys
import time

import numpy as np
import praw  # moved to top of file with the other imports
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

print("Загрузка баз данных для эмбеддингов...")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

wiki = wikipediaapi.Wikipedia(
    language='ru',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="NewtonBot v1.0 (kolyadual.man@gmail.com)",
)

KNOWLEDGE_DIR = "knowledge"
os.makedirs(KNOWLEDGE_DIR, exist_ok=True)

# Accumulates every fragment as {"source": ..., "title": ..., "text": ...}.
all_texts = []

# --- Habr articles ---------------------------------------------------------
habr_urls = [
    "https://habr.com/ru/articles/some-python-guide/",
    "https://habr.com/ru/articles/debian-setup-tips/",
]
for url in habr_urls:
    try:
        # Fix: a timeout is mandatory — without it a stalled connection
        # hangs the whole script indefinitely.
        r = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15
        )
        soup = BeautifulSoup(r.text, 'html.parser')
        # Fix: call find("h1") once instead of twice.
        h1 = soup.find("h1")
        title = h1.get_text() if h1 else "Без названия"
        content = soup.find("div", {"id": "post-content-body"})
        if content:
            text = content.get_text()[:2000]  # cap fragment length
            all_texts.append({"source": "habr", "title": title, "text": text})
            print(f"Готово: {title}")
    except Exception as e:
        # Best-effort scraping: log and continue with the next URL.
        print(f"Ошибка Habr: {e}")
    time.sleep(1)  # be polite between requests

# --- Hard-coded recipes ----------------------------------------------------
recipes = [
    {
        "title": "Борщ классический",
        "text": "Сварить бульон из говядины. Обжарить свеклу, добавить капусту, картофель, чеснок. Варить 1.5 часа."
    },
    {
        "title": "Пельмени домашние",
        "text": "Замесить тесто, сделать фарш из мяса и лука, слепить пельмени, заморозить или сразу варить."
    }
]
all_texts.extend([{"source": "recipe", **r} for r in recipes])

# --- Wikipedia summaries ---------------------------------------------------
# Fix: dropped the stray empty-string topic that was a pointless/invalid
# page lookup in the original list.
wiki_topics = [
    "Debian", "GTK", "Искусственный интеллект", "Браузер", "Ubuntu",
    "Windows", "Zorin OS", "Видеоигра", "Linux Mint",
]
for topic in wiki_topics:
    page = wiki.page(topic)
    if page.exists():
        all_texts.append({
            "source": "wikipedia",
            "title": "Что такое " + page.title,
            "text": page.summary[:2000],
        })
        print(f"Wikipedia: {topic}")

# --- Reddit posts ----------------------------------------------------------
# SECURITY: credentials were hard-coded in source. Prefer environment
# variables; the old values remain only as a backward-compatible fallback
# and should be revoked/rotated and removed.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "2LkJE23ueEU1ur8FjZvNiA"),
    client_secret=os.environ.get(
        "REDDIT_CLIENT_SECRET", "FISSVAi2J_QrTVHdoQLzAGs2DnDTIg"
    ),
    user_agent="newtonbot:v1.0",
)
subreddits = ["r/Python", "r/debian", "r/selfhosted"]
for sr in subreddits:
    try:
        # Fix: strip the "r/" prefix explicitly rather than with sr[2:].
        for post in reddit.subreddit(sr.removeprefix("r/")).hot(limit=5):
            all_texts.append({
                "source": "reddit",
                "title": post.title,
                # Link-only posts have empty selftext; fall back to title.
                "text": post.selftext[:2000] or post.title,
            })
    except Exception as e:
        print(f"Reddit: {e}")

# --- Persist chunks and embeddings -----------------------------------------
with open(f"{KNOWLEDGE_DIR}/full_knowledge.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("Генерация эмбеддингов...")
sentences = [item["text"] for item in all_texts]
embeddings = model.encode(sentences, show_progress_bar=True)
np.save(f"{KNOWLEDGE_DIR}/embeddings.npy", embeddings)

# NOTE(review): chunks.json duplicates full_knowledge.json exactly — kept
# for compatibility with whatever consumes each file; confirm both are
# actually needed downstream.
with open(f"{KNOWLEDGE_DIR}/chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print(f"База знаний готова! {len(all_texts)} фрагментов.")