"""Builds the NewtonBot knowledge base.

Collects texts from Habr articles, hand-written recipes, Russian Wikipedia
summaries, and Reddit posts, then stores the chunks and their sentence
embeddings under knowledge/ for later retrieval.
"""
import json
import os
import time

import numpy as np
import praw
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

print("Loading the embedding model...")
# Multilingual model: Russian and English texts share one vector space.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
wiki = wikipediaapi.Wikipedia(
    language='ru',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="NewtonBot v1.0 (kolyadual.man@gmail.com)"
)
KNOWLEDGE_DIR = "knowledge"
os.makedirs(KNOWLEDGE_DIR, exist_ok=True)
all_texts = []
habr_urls = [
    "https://habr.com/ru/articles/some-python-guide/",
    "https://habr.com/ru/articles/debian-setup-tips/",
]
for url in habr_urls:
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        title = soup.find("h1").get_text() if soup.find("h1") else "Untitled"
        # Habr puts the article body in a div with this id.
        content = soup.find("div", {"id": "post-content-body"})
        if content:
            text = content.get_text()[:2000]
            all_texts.append({"source": "habr", "title": title, "text": text})
            print(f"Done: {title}")
    except Exception as e:
        print(f"Habr error: {e}")
    time.sleep(1)  # be polite to the server between requests
recipes = [
    {
        "title": "Classic borscht",
        "text": "Make a beef broth. Saute the beets, then add cabbage, potatoes, and garlic. Simmer for 1.5 hours."
    },
    {
        "title": "Homemade pelmeni",
        "text": "Knead the dough, make a filling of minced meat and onion, shape the dumplings, then freeze them or boil right away."
    }
]
all_texts.extend([{"source": "recipe", **r} for r in recipes])
# Page titles on ru.wikipedia.org; they must stay in Russian to resolve.
wiki_topics = ["Debian", "GTK", "Искусственный интеллект", "Браузер", "Ubuntu",
               "Windows", "Zorin OS", "Видеоигра", "Linux Mint"]
for topic in wiki_topics:
    page = wiki.page(topic)
    if page.exists():
        all_texts.append({
            "source": "wikipedia",
            "title": "What is " + page.title,
            "text": page.summary[:2000]
        })
        print(f"Wikipedia: {topic}")
# PRAW credentials are read from the environment so real secrets are not
# committed to the repository (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET are
# assumed variable names).
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="newtonbot:v1.0"
)
subreddits = ["r/Python", "r/debian", "r/selfhosted"]
for sr in subreddits:
    try:
        # sr[2:] strips the "r/" prefix; praw expects the bare subreddit name.
        for post in reddit.subreddit(sr[2:]).hot(limit=5):
            all_texts.append({
                "source": "reddit",
                "title": post.title,
                # Link posts have an empty selftext; fall back to the title.
                "text": post.selftext[:2000] or post.title
            })
    except Exception as e:
        print(f"Reddit error: {e}")
with open(f"{KNOWLEDGE_DIR}/full_knowledge.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("Generating embeddings...")
sentences = [item["text"] for item in all_texts]
embeddings = model.encode(sentences, show_progress_bar=True)
np.save(f"{KNOWLEDGE_DIR}/embeddings.npy", embeddings)

# chunks.json mirrors full_knowledge.json: row i of embeddings.npy is the
# vector for entry i of chunks.json.
with open(f"{KNOWLEDGE_DIR}/chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)
print(f"Knowledge base ready! {len(all_texts)} chunks.")