# web_indexer_universal_v7.py
# FINAL VERSION 2.0: no synonyms, with dynamic AI categorization.
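# Assumed runtime requirements (inferred from the imports and environment lookups in this script):
#   pip install requests beautifulsoup4 elasticsearch python-dotenv together tiktoken nltk torch sentence-transformers
# The script also expects TOGETHER_API_KEY, ES_CLOUD_ID and ES_API_KEY to be available as
# environment variables (or via a .env file loaded by python-dotenv).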
import os
import time
import traceback
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from elasticsearch import Elasticsearch, helpers
import sys

# === ANSI color codes ===
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
RESET = '\033[0m'
CYAN = '\033[96m'
# --- Importing and checking optional libraries ---
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

try:
    import together
    from dotenv import load_dotenv
    load_dotenv()
    together_api_key = os.getenv("TOGETHER_API_KEY")
    if not together_api_key:
        print(f"{YELLOW}Warning: TOGETHER_API_KEY is not set, LLM features will not work.{RESET}")
        together_client = None
    else:
        together_client = together.Together(api_key=together_api_key)
        print(f"{GREEN}Together AI client initialized.{RESET}")
except ImportError:
    together_client = None

try:
    import tiktoken
    tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False

try:
    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print(f"{CYAN}Downloading NLTK 'punkt'...{RESET}")
        nltk.download('punkt', quiet=True)
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMER_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMER_AVAILABLE = False
# --- Configuration ---
ES_CLOUD_ID = os.getenv("ES_CLOUD_ID")
ES_API_KEY = os.getenv("ES_API_KEY")
START_URL = "https://www.dunaelektronika.com/"
TARGET_DOMAIN = "dunaelektronika.com"
MAX_DEPTH = 2
REQUEST_DELAY = 1
USER_AGENT = "MyPythonCrawler/1.0"
VECTOR_INDEX_NAME = "dunawebindexai"
BATCH_SIZE = 50
ES_CLIENT_TIMEOUT = 120
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embedding_model = None
EMBEDDING_DIM = 768  # Default; updated after the model is loaded
device = 'cpu'
CHUNK_SIZE_TOKENS = 500
CHUNK_OVERLAP_TOKENS = 50
MIN_CHUNK_SIZE_CHARS = 50
LLM_CHUNK_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# === Index settings & mapping (SIMPLIFIED, WITHOUT SYNONYMS) ===
INDEX_SETTINGS_SIMPLE = {
    "analysis": {
        "filter": {
            "hungarian_stop": {"type": "stop", "stopwords": "_hungarian_"},
            "hungarian_stemmer": {"type": "stemmer", "language": "hungarian"}
        },
        "analyzer": {
            "hungarian_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase", "hungarian_stop", "hungarian_stemmer"]
            }
        }
    }
}
INDEX_MAPPINGS_SIMPLE = {
    "properties": {
        "text_content": {"type": "text", "analyzer": "hungarian_analyzer"},
        "embedding": {"type": "dense_vector", "dims": EMBEDDING_DIM, "index": True, "similarity": "cosine"},
        "source_origin": {"type": "keyword"},
        "source_url": {"type": "keyword"},
        "source_type": {"type": "keyword"},
        "category": {"type": "keyword"},  # The 'keyword' type can also hold lists
        "heading": {"type": "text", "analyzer": "hungarian_analyzer"},
        "summary": {"type": "text", "analyzer": "hungarian_analyzer"}
    }
}
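# A minimal kNN query sketch against this mapping once the index is populated
# (illustrative only; assumes an Elasticsearch 8.x client and a loaded embedding model):
#     es_client.search(
#         index=VECTOR_INDEX_NAME,
#         knn={"field": "embedding", "query_vector": get_embedding("query text"),
#              "k": 5, "num_candidates": 50},
#     )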
# --- Helper functions ---
def initialize_es_client():
    print(f"\n{CYAN}Connecting to Elasticsearch...{RESET}")
    if not ES_CLOUD_ID or not ES_API_KEY:
        print(f"{RED}Error: ES_CLOUD_ID or ES_API_KEY is missing from GitHub Secrets!{RESET}")
        return None
    try:
        client = Elasticsearch(
            cloud_id=ES_CLOUD_ID,
            api_key=ES_API_KEY,
            request_timeout=ES_CLIENT_TIMEOUT
        )
        if not client.ping():
            raise ConnectionError("Ping failed.")
        print(f"{GREEN}Successfully connected to Elasticsearch!{RESET}")
        return client
    except Exception as e:
        print(f"{RED}Error while connecting to Elasticsearch: {e}{RESET}")
        return None
def load_embedding_model():
    global embedding_model, EMBEDDING_DIM, device
    if not (TORCH_AVAILABLE and SENTENCE_TRANSFORMER_AVAILABLE):
        print(f"{RED}PyTorch or SentenceTransformer is not installed. Embedding will not work.{RESET}")
        return
    print(f"\n{CYAN}Loading embedding model '{EMBEDDING_MODEL_NAME}'...{RESET}")
    try:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)
        embedding_model = model
        EMBEDDING_DIM = model.get_sentence_embedding_dimension()
        # Keep the mapping in sync with the actual model dimension
        INDEX_MAPPINGS_SIMPLE["properties"]["embedding"]["dims"] = EMBEDDING_DIM
        print(f"{GREEN}Embedding model loaded (dim: {EMBEDDING_DIM}, device: {device}).{RESET}")
    except Exception as e:
        print(f"{RED}Error while loading the embedding model: {e}{RESET}")
        embedding_model = None
def generate_dynamic_categories_with_llm(llm_client, soup, text):
    if not llm_client:
        return ["általános"]
    h1_text = ""
    try:
        h1_tag = soup.find('h1')
        if h1_tag:
            h1_text = h1_tag.get_text(strip=True)
    except Exception:
        pass
    try:
        # The prompt is kept in Hungarian (matching the crawled site); it asks for 1-3 short,
        # comma-separated categories that best describe the page, with no extra explanation.
        prompt = f"""Elemezd a következő magyar nyelvű weboldal tartalmát, és adj meg 1-3 rövid, releváns kategóriát vagy címkét, ami a legjobban leírja azt. A kategóriákat vesszővel válaszd el. A válaszodban csak a kategóriák szerepeljenek, más magyarázat nélkül.
Weboldal címe: "{h1_text}"
Szöveg eleje: {text[:1500]}
Kategóriák:"""
        response = llm_client.chat.completions.create(
            model=LLM_CHUNK_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=50
        )
        if response and response.choices:
            categories_str = response.choices[0].message.content.strip()
            # Parse the answer: split on commas, strip whitespace, lowercase
            categories = [cat.strip().lower() for cat in categories_str.split(',') if cat.strip()]
            print(f"{GREEN} -> Dynamic categories from the AI: {categories}{RESET}")
            return categories if categories else ["általános"]
        return ["általános"]
    except Exception as e:
        print(f"{RED}Error during dynamic LLM categorization: {e}{RESET}")
        return ["általános"]
def generate_summary_with_llm(llm_client, text):
    if not llm_client:
        return text[:300] + "..."
    try:
        # The prompt is kept in Hungarian; it asks for a short but informative summary of the text.
        prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
Szöveg: {text[:4000]}
Összefoglalás:"""
        response = llm_client.chat.completions.create(
            model=LLM_CHUNK_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=500
        )
        if response and response.choices:
            summary = response.choices[0].message.content.strip()
            print(f"{GREEN} -> LLM summary generated successfully.{RESET}")
            return summary
        return text[:300] + "..."  # Fall back to a plain excerpt if the response has no choices
    except Exception as e:
        print(f"{RED}Error during LLM summarization: {e}{RESET}")
        return text[:300] + "..."
def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
    if not TIKTOKEN_AVAILABLE:
        # Rough character-based fallback (~4 characters per token)
        chunks, start = [], 0
        while start < len(text):
            end = start + (chunk_size * 4)
            chunks.append(text[start:end])
            start = end - (chunk_overlap * 4)
        return chunks
    tokens = tiktoken_encoder.encode(text)
    chunks, start = [], 0
    while start < len(tokens):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunks.append(tiktoken_encoder.decode(chunk_tokens))
        start += chunk_size - chunk_overlap
    return chunks
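# Worked example of the token-based chunking above (illustrative): with CHUNK_SIZE_TOKENS=500
# and CHUNK_OVERLAP_TOKENS=50, a 1200-token page yields three chunks covering tokens
# 0-499, 450-949 and 900-1199, i.e. consecutive chunks share a 50-token overlap.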
def get_embedding(text):
    if not embedding_model:
        return None
    try:
        return embedding_model.encode(text, normalize_embeddings=True).tolist()
    except Exception as e:
        print(f"{RED}Error during embedding: {e}{RESET}")
        return None
def create_es_index(client, index_name, index_settings, index_mappings):
    print(f"\n{CYAN}Checking index: '{index_name}'...{RESET}")
    try:
        if not client.indices.exists(index=index_name):
            print(f"Creating index '{index_name}'...")
            client.indices.create(index=index_name, settings=index_settings, mappings=index_mappings)
            print(f"{GREEN}Index created successfully.{RESET}")
        else:
            print(f"Index '{index_name}' already exists.")
        return True
    except Exception as e:
        print(f"{RED}!!! Error while creating the index: {e}{RESET}")
        return False
def extract_text_from_html(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
            element.decompose()
        main_content = soup.find('main') or soup.find('article') or soup.body or soup
        text = main_content.get_text(separator='\n', strip=True)
        return "\n".join(line for line in text.splitlines() if line.strip())
    except Exception as e:
        print(f"{RED}Error while extracting text from HTML: {e}{RESET}")
        return ""
def extract_and_filter_links(soup, base_url, target_domain):
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        if href and not href.startswith(('#', 'mailto:', 'javascript:')):
            full_url = urljoin(base_url, href)
            parsed_url = urlparse(full_url)
            if parsed_url.scheme in ['http', 'https'] and parsed_url.netloc == target_domain:
                links.add(parsed_url._replace(fragment="").geturl())
    return links
def crawl_and_index_website(start_url, max_depth, es_client, index_name):
    visited_urls, urls_to_visit = set(), deque([(start_url, 0)])
    bulk_actions, total_indexed = [], 0
    target_domain = urlparse(start_url).netloc
    print(f"Starting web crawl: {start_url} (max depth: {max_depth}, target: {target_domain})")
    while urls_to_visit:
        try:
            current_url, current_depth = urls_to_visit.popleft()
        except IndexError:
            break  # No more URLs in the queue
        if current_url in visited_urls:
            continue
        print(f"\n--- Processing (depth: {current_depth}): {current_url} ---")
        visited_urls.add(current_url)
        try:
            headers = {'User-Agent': USER_AGENT}
            response = requests.get(current_url, headers=headers, timeout=15)
            response.raise_for_status()
            if 'text/html' not in response.headers.get('content-type', '').lower():
                print(f"  {YELLOW}-> Not HTML content, skipping.{RESET}")
                continue
            html_content = response.content
            soup = BeautifulSoup(html_content, 'html.parser')
            page_text = extract_text_from_html(html_content)
            if not page_text or len(page_text) < MIN_CHUNK_SIZE_CHARS:
                print(f"  {YELLOW}-> Could not extract text, or it is too short.{RESET}")
                continue
            final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
            categories = generate_dynamic_categories_with_llm(together_client, soup, page_text)
            page_summary = generate_summary_with_llm(together_client, page_text)
            print(f"{GREEN}  Prepared for indexing: {len(final_chunks)} chunks (categories: {categories}){RESET}")
            for chunk_text in final_chunks:
                element_vector = get_embedding(chunk_text)
                if element_vector:
                    doc = {
                        "text_content": chunk_text, "embedding": element_vector, "source_origin": "website",
                        "source_url": current_url, "source_type": "token_chunking",
                        "category": categories, "summary": page_summary,
                        "heading": soup.find('h1').get_text(strip=True) if soup.find('h1') else ''
                    }
                    bulk_actions.append({"_index": index_name, "_source": doc})
                    if len(bulk_actions) >= BATCH_SIZE:
                        print(f"  -> Indexing {len(bulk_actions)} chunks (batch)...")
                        success_count, _ = helpers.bulk(es_client, bulk_actions)
                        total_indexed += success_count
                        bulk_actions = []
            if current_depth < max_depth:
                # Resolve relative links against the current page rather than the start URL
                new_links = extract_and_filter_links(soup, current_url, target_domain)
                for link in new_links:
                    if link not in visited_urls:
                        urls_to_visit.append((link, current_depth + 1))
            time.sleep(REQUEST_DELAY)
        except requests.exceptions.RequestException as req_err:
            print(f"  {RED}!!! Error during download: {req_err}{RESET}")
        except Exception as e:
            print(f"  {RED}!!! Unexpected error in the crawl loop ({current_url}): {e}{RESET}")
    if bulk_actions:
        print(f"  -> Indexing the remaining {len(bulk_actions)} chunks...")
        success_count, _ = helpers.bulk(es_client, bulk_actions)
        total_indexed += success_count
    print(f"\n--- Web Crawling and Indexing Finished ---")
    print(f"URLs visited: {len(visited_urls)}")
    print(f"Chunks indexed successfully: {total_indexed}")
    return total_indexed
# === Main program ===
if __name__ == "__main__":
    print("----- Starting Web Crawler and Indexer (with dynamic AI categorization) -----")
    load_embedding_model()
    if not embedding_model:
        print(f"{RED}Error: loading the embedding model failed. Exiting.{RESET}")
        sys.exit(1)
    es_client = initialize_es_client()
    if es_client:
        try:
            if es_client.indices.exists(index=VECTOR_INDEX_NAME):
                print(f"{YELLOW}Index '{VECTOR_INDEX_NAME}' already exists. Deleting...{RESET}")
                es_client.indices.delete(index=VECTOR_INDEX_NAME)
                print(f"{GREEN}Index deleted successfully.{RESET}")
            index_ready = create_es_index(
                client=es_client,
                index_name=VECTOR_INDEX_NAME,
                index_settings=INDEX_SETTINGS_SIMPLE,
                index_mappings=INDEX_MAPPINGS_SIMPLE
            )
            if index_ready:
                final_success_count = crawl_and_index_website(START_URL, MAX_DEPTH, es_client, VECTOR_INDEX_NAME)
                if final_success_count > 0:
                    print(f"\n{GREEN}The process completed successfully. {final_success_count} documents indexed.{RESET}")
                else:
                    print(f"\n{YELLOW}The process finished, but 0 new documents were indexed.{RESET}")
            else:
                print(f"{RED}Error: the index is not ready for use.{RESET}")
        except Exception as e:
            print(f"{RED}Error in the main program block: {e}{RESET}")
    else:
        print(f"{RED}Error: the Elasticsearch client is not available.{RESET}")