"""Builds the NewtonBot knowledge base.

Collects texts from Habr articles, hand-written recipes, Russian Wikipedia
summaries, and Reddit posts, then stores the chunks and their sentence
embeddings under knowledge/ for later retrieval.
"""
import json
import os
import time

import numpy as np
import praw
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

print("Loading the embedding model...")
# Multilingual model: Russian and English texts share one vector space.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
wiki = wikipediaapi.Wikipedia(
    language='ru',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="NewtonBot v1.0 (kolyadual.man@gmail.com)"
)
KNOWLEDGE_DIR = "knowledge"
os.makedirs(KNOWLEDGE_DIR, exist_ok=True)
all_texts = []
habr_urls = [
    "https://habr.com/ru/articles/some-python-guide/",
    "https://habr.com/ru/articles/debian-setup-tips/",
]
for url in habr_urls:
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        title = soup.find("h1").get_text() if soup.find("h1") else "Untitled"
        # Habr puts the article body in a div with this id.
        content = soup.find("div", {"id": "post-content-body"})
        if content:
            text = content.get_text()[:2000]
            all_texts.append({"source": "habr", "title": title, "text": text})
            print(f"Done: {title}")
    except Exception as e:
        print(f"Habr error: {e}")
    time.sleep(1)  # be polite to the server between requests
recipes = [
    {
        "title": "Classic borscht",
        "text": "Make a beef broth. Saute the beets, then add cabbage, potatoes, and garlic. Simmer for 1.5 hours."
    },
    {
        "title": "Homemade pelmeni",
        "text": "Knead the dough, make a filling of minced meat and onion, shape the dumplings, then freeze them or boil right away."
    }
]
all_texts.extend([{"source": "recipe", **r} for r in recipes])
# Page titles on ru.wikipedia.org; they must stay in Russian to resolve.
wiki_topics = ["Debian", "GTK", "Искусственный интеллект", "Браузер", "Ubuntu",
               "Windows", "Zorin OS", "Видеоигра", "Linux Mint"]
for topic in wiki_topics:
    page = wiki.page(topic)
    if page.exists():
        all_texts.append({
            "source": "wikipedia",
            "title": "What is " + page.title,
            "text": page.summary[:2000]
        })
        print(f"Wikipedia: {topic}")
# PRAW credentials are read from the environment so real secrets are not
# committed to the repository (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET are
# assumed variable names).
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="newtonbot:v1.0"
)
subreddits = ["r/Python", "r/debian", "r/selfhosted"]
for sr in subreddits:
    try:
        # sr[2:] strips the "r/" prefix; praw expects the bare subreddit name.
        for post in reddit.subreddit(sr[2:]).hot(limit=5):
            all_texts.append({
                "source": "reddit",
                "title": post.title,
                # Link posts have an empty selftext; fall back to the title.
                "text": post.selftext[:2000] or post.title
            })
    except Exception as e:
        print(f"Reddit error: {e}")
with open(f"{KNOWLEDGE_DIR}/full_knowledge.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("Generating embeddings...")
sentences = [item["text"] for item in all_texts]
embeddings = model.encode(sentences, show_progress_bar=True)
np.save(f"{KNOWLEDGE_DIR}/embeddings.npy", embeddings)

# chunks.json mirrors full_knowledge.json: row i of embeddings.npy is the
# vector for entry i of chunks.json.
with open(f"{KNOWLEDGE_DIR}/chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)
print(f"Knowledge base ready! {len(all_texts)} chunks.")