import json
import os
import time

import numpy as np
import praw
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import wikipediaapi
print("Loading the embedding model...")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Russian-language Wikipedia client; the User-Agent identifies the bot,
# as the Wikimedia API policy requires.
wiki = wikipediaapi.Wikipedia(
    language='ru',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="NewtonBot v1.0 (kolyadual.man@gmail.com)"
)
KNOWLEDGE_DIR = "knowledge"
os.makedirs(KNOWLEDGE_DIR, exist_ok=True)

# Every source below appends {"source", "title", "text"} records here.
all_texts = []
# Scrape a few Habr articles (the URL slugs here are placeholders).
habr_urls = [
    "https://habr.com/ru/articles/some-python-guide/",
    "https://habr.com/ru/articles/debian-setup-tips/",
]
for url in habr_urls:
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        h1 = soup.find("h1")
        title = h1.get_text(strip=True) if h1 else "Без названия"
        content = soup.find("div", {"id": "post-content-body"})
        if content:
            text = content.get_text()[:2000]
            all_texts.append({"source": "habr", "title": title, "text": text})
            print(f"Done: {title}")
    except Exception as e:
        print(f"Habr error: {e}")
    time.sleep(1)  # be polite to the server between requests
# Hand-written recipe entries; the text stays in Russian because that is
# the language of the knowledge base.
recipes = [
    {
        "title": "Борщ классический",
        "text": "Сварить бульон из говядины. Обжарить свеклу, добавить капусту, картофель, чеснок. Варить 1.5 часа."
    },
    {
        "title": "Пельмени домашние",
        "text": "Замесить тесто, сделать фарш из мяса и лука, слепить пельмени, заморозить или сразу варить."
    }
]
all_texts.extend([{"source": "recipe", **r} for r in recipes])
# Fetch summaries for a set of ru.wikipedia pages (titles must match the
# Russian page names exactly).
wiki_topics = ["Debian", "GTK", "Искусственный интеллект", "Браузер", "Ubuntu",
               "Windows", "Zorin OS", "Видеоигра", "Linux Mint"]
for topic in wiki_topics:
    page = wiki.page(topic)
    if page.exists():
        all_texts.append({
            "source": "wikipedia",
            "title": "Что такое " + page.title,
            "text": page.summary[:2000]
        })
        print(f"Wikipedia: {topic}")
# Reddit credentials come from the environment; real client secrets must
# never be hardcoded or committed to source control.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="newtonbot:v1.0"
)
subreddits = ["Python", "debian", "selfhosted"]
for sr in subreddits:
    try:
        for post in reddit.subreddit(sr).hot(limit=5):
            all_texts.append({
                "source": "reddit",
                "title": post.title,
                # Link posts have an empty selftext; fall back to the title.
                "text": post.selftext[:2000] or post.title
            })
    except Exception as e:
        print(f"Reddit error: {e}")
# Persist the raw corpus.
with open(f"{KNOWLEDGE_DIR}/full_knowledge.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("Generating embeddings...")
sentences = [item["text"] for item in all_texts]
embeddings = model.encode(sentences, show_progress_bar=True)

# Save the vectors alongside a copy of the texts: row i of embeddings.npy
# corresponds to entry i of chunks.json.
np.save(f"{KNOWLEDGE_DIR}/embeddings.npy", embeddings)
with open(f"{KNOWLEDGE_DIR}/chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print(f"Knowledge base ready! {len(all_texts)} fragments.")
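
# --- Retrieval sketch ---
# A minimal illustration (not part of the build itself) of how the saved
# knowledge base could be queried: embed a question with the same model and
# rank fragments by cosine similarity. The helper name `search`, the `top_k`
# default, and the sample query are illustrative assumptions, not an
# existing API.
def search(query, top_k=3):
    q = model.encode([query])[0]
    # Cosine similarity between the query vector and every stored vector.
    sims = embeddings @ q / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q) + 1e-9)
    return [all_texts[i]["title"] for i in np.argsort(sims)[::-1][:top_k]]

print("Sanity check:", search("Что такое Debian?"))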