File size: 3,455 Bytes
import json
import os
import sys
import requests
from bs4 import BeautifulSoup
import wikipediaapi
import numpy as np
from sentence_transformers import SentenceTransformer
import time

# --- Global setup: embedding model, Wikipedia client, output directory ---

# Name of the sentence-transformers checkpoint used to embed every fragment.
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'

print("Загрузка баз данных для эмбеддингов...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Settings for the Russian-language Wikipedia client; the user agent
# identifies this bot and gives a contact address.
_wiki_settings = {
    "language": 'ru',
    "extract_format": wikipediaapi.ExtractFormat.WIKI,
    "user_agent": "NewtonBot v1.0 (kolyadual.man@gmail.com)",
}
wiki = wikipediaapi.Wikipedia(**_wiki_settings)

# Directory that receives the generated JSON and .npy files; created up
# front so later writes cannot fail on a missing folder.
KNOWLEDGE_DIR = "knowledge"
os.makedirs(KNOWLEDGE_DIR, exist_ok=True)

# Accumulates every collected knowledge fragment as a dict with
# "source", "title" and "text" keys.
all_texts = []

# Habr articles to scrape.
habr_urls = [
    "https://habr.com/ru/articles/some-python-guide/",
    "https://habr.com/ru/articles/debian-setup-tips/",
]

# Seconds to wait for Habr before giving up on a request; without a timeout
# a stalled connection would block the whole script indefinitely.
HABR_TIMEOUT = 10

for url in habr_urls:
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=HABR_TIMEOUT,
        )
        # Turn 4xx/5xx answers into an exception instead of parsing an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        heading = soup.find("h1")
        title = heading.get_text() if heading else "Без названия"
        content = soup.find("div", {"id": "post-content-body"})
        if content:
            # Keep only the first 2000 characters of the article body.
            text = content.get_text()[:2000]
            all_texts.append({"source": "habr", "title": title, "text": text})
            print(f"Готово: {title}")
        else:
            # Report the skip so a changed page layout does not go unnoticed.
            print(f"Habr: содержимое статьи не найдено ({url})")
    except Exception as e:
        # Best-effort scraping: one failing article must not stop the run.
        print(f"Ошибка Habr: {e}")
    # Pause between requests to stay polite to the server.
    time.sleep(1)

# Hand-written recipe fragments; each one only carries a title and a text.
recipes = [
    dict(
        title="Борщ классический",
        text="Сварить бульон из говядины. Обжарить свеклу, добавить капусту, картофель, чеснок. Варить 1.5 часа.",
    ),
    dict(
        title="Пельмени домашние",
        text="Замесить тесто, сделать фарш из мяса и лука, слепить пельмени, заморозить или сразу варить.",
    ),
]

# Tag every recipe with its source and add it to the shared fragment list.
for recipe in recipes:
    tagged_recipe = {"source": "recipe"}
    tagged_recipe.update(recipe)
    all_texts.append(tagged_recipe)

# Wikipedia articles whose summaries are added to the knowledge base.
wiki_topics = [
    "Debian",
    "GTK",
    "Искусственный интеллект",
    "Браузер",
    "Ubuntu",
    "Windows",
    "Zorin OS",
    "Видеоигра",
    "Linux Mint",
]

for topic in wiki_topics:
    # Guard against blank entries: an empty title cannot name a page.
    if not topic or not topic.strip():
        continue
    try:
        # NOTE(review): wikipediaapi appears to fetch page data lazily, so the
        # network request presumably happens inside exists()/summary - which is
        # why the whole lookup sits inside the try block.
        page = wiki.page(topic)
        if not page.exists():
            print(f"Wikipedia: страница не найдена ({topic})")
            continue
        all_texts.append({
            "source": "wikipedia",
            "title": "Что такое " + page.title,
            # Keep only the first 2000 characters of the summary.
            "text": page.summary[:2000]
        })
        print(f"Wikipedia: {topic}")
    except Exception as e:
        # Best-effort, like the other sources: a failed lookup must not abort
        # the run before the knowledge base is written.
        print(f"Ошибка Wikipedia: {e}")

import praw

# Reddit API credentials are read from the environment so that secrets never
# live in the source tree. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running. Any credentials that were ever committed to version control should
# be treated as leaked and rotated in the Reddit app settings.
reddit_client_id = os.environ.get("REDDIT_CLIENT_ID")
reddit_client_secret = os.environ.get("REDDIT_CLIENT_SECRET")

# Subreddits to sample, written with the conventional "r/" prefix.
subreddits = ["r/Python", "r/debian", "r/selfhosted"]

if reddit_client_id and reddit_client_secret:
    reddit = praw.Reddit(
        client_id=reddit_client_id,
        client_secret=reddit_client_secret,
        user_agent="newtonbot:v1.0"
    )

    for sr in subreddits:
        # The subreddit lookup is given the bare name, so strip the "r/"
        # prefix only when it is actually present.
        subreddit_name = sr[2:] if sr.startswith("r/") else sr
        try:
            for post in reddit.subreddit(subreddit_name).hot(limit=5):
                all_texts.append({
                    "source": "reddit",
                    "title": post.title,
                    # Link posts have an empty selftext; fall back to the title.
                    "text": post.selftext[:2000] or post.title
                })
        except Exception as e:
            # Best-effort: a failing subreddit must not stop the run.
            print(f"Reddit: {e}")
else:
    # Without credentials the Reddit source is skipped instead of crashing.
    reddit = None
    print("Reddit: пропущено — не заданы REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET")

# --- Persist the collected fragments and their embeddings ---

# Output locations inside the knowledge directory.
full_knowledge_path = f"{KNOWLEDGE_DIR}/full_knowledge.json"
embeddings_path = f"{KNOWLEDGE_DIR}/embeddings.npy"
chunks_path = f"{KNOWLEDGE_DIR}/chunks.json"

# Dump the raw fragments first, so they are on disk before encoding starts.
with open(full_knowledge_path, mode="w", encoding="utf-8") as knowledge_file:
    json.dump(all_texts, knowledge_file, ensure_ascii=False, indent=2)

print("Генерация эмбеддингов...")
# Only the "text" field of each fragment is embedded.
sentences = []
for fragment in all_texts:
    sentences.append(fragment["text"])
embeddings = model.encode(sentences, show_progress_bar=True)

# Write the embeddings and the same fragment list side by side, in the same
# order as the sentences that were encoded.
np.save(embeddings_path, embeddings)
with open(chunks_path, mode="w", encoding="utf-8") as chunks_file:
    json.dump(all_texts, chunks_file, ensure_ascii=False, indent=2)

fragment_count = len(all_texts)
print(f"База знаний готова! {fragment_count} фрагментов.")