Spaces:
Sleeping
Sleeping
# web_indexer_universal_v7.py
# FINAL VERSION: integrates GitHub Secrets and uses the uploaded synonym set.
| import os | |
| import time | |
| import traceback | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from collections import deque | |
| from elasticsearch import Elasticsearch, helpers, exceptions as es_exceptions | |
| import sys | |
| import warnings | |
# === ANSI colour codes (console logging) ===
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
RESET = '\033[0m'
CYAN = '\033[96m'

# --- Optional third-party libraries ---
# Every dependency is probed on its own so that one missing package cannot
# silently disable an unrelated feature. Each probe leaves a *_AVAILABLE flag
# (or a client/encoder object that is None) for the rest of the file to check.

try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print(f"{RED}FIGYELEM: Torch nincs telepítve.{RESET}")

# python-dotenv is only a convenience for local development (.env file).
# It must not be a precondition for the Together client: when it is missing
# the variables are simply read from the real environment.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

together_api_key = os.getenv("TOGETHER_API_KEY")
together_client = None
try:
    import together
except ImportError:
    print(f"{YELLOW}Figyelem: A 'together' csomag nincs telepítve. LLM funkciók nem működnek.{RESET}")
else:
    if not together_api_key:
        print(f"{YELLOW}Figyelem: TOGETHER_API_KEY környezeti változó nincs beállítva. LLM funkciók nem működnek.{RESET}")
    else:
        together_client = together.Together(api_key=together_api_key)
        print(f"{GREEN}Together AI kliens inicializálva.{RESET}")

# tiktoken_encoder is always defined; it stays None when the package is
# missing or the encoding cannot be loaded, and TIKTOKEN_AVAILABLE mirrors that.
tiktoken_encoder = None
try:
    import tiktoken
except ImportError:
    pass
else:
    try:
        tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
    except Exception as exc:
        # Loading the encoding can fail for reasons other than a missing
        # package; fall back to character-based chunking instead of crashing.
        print(f"{YELLOW}Figyelem: A tiktoken kódolás betöltése sikertelen ({exc}). Karakter alapú darabolás lesz használva.{RESET}")
TIKTOKEN_AVAILABLE = tiktoken_encoder is not None

try:
    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print(f"{CYAN}NLTK 'punkt' letöltése...{RESET}")
        nltk.download('punkt', quiet=True)
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMER_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMER_AVAILABLE = False
# --- Configuration ---

# Elasticsearch: credentials are read from environment variables
# (GitHub Actions passes them in from the repository Secrets).
ES_CLOUD_ID = os.environ.get("ES_CLOUD_ID")
ES_API_KEY = os.environ.get("ES_API_KEY")
ES_CLIENT_TIMEOUT = 120  # passed to the client as request_timeout
VECTOR_INDEX_NAME = "dunawebindexai"
BATCH_SIZE = 50  # number of queued documents that triggers a bulk request

# Crawler
START_URL = "https://www.dunaelektronika.com/"
TARGET_DOMAIN = "dunaelektronika.com"
MAX_DEPTH = 2
REQUEST_DELAY = 1  # handed to time.sleep() after each processed page
USER_AGENT = "MyPythonCrawler/1.0"

# Embedding model; the three runtime slots are filled by load_embedding_model()
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embedding_model = None
EMBEDDING_DIM = None
device = 'cpu'

# Chunking
CHUNK_SIZE_TOKENS = 500
CHUNK_OVERLAP_TOKENS = 50
MIN_CHUNK_SIZE_CHARS = 50  # pages whose extracted text is shorter are skipped

# Miscellaneous
DEBUG_MODE = True
LLM_CHUNK_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# === Index settings & mapping ===
# The search-time synonym filter refers to the "synonyms-hu" synonym set by
# name (per the original note, that set is created in Kibana beforehand).
_HUNGARIAN_TOKEN_FILTERS = {
    "hungarian_stop": {"type": "stop", "stopwords": "_hungarian_"},
    "hungarian_stemmer": {"type": "stemmer", "language": "hungarian"},
    "synonym_filter": {
        "type": "synonym_graph",
        "synonyms_set": "synonyms-hu",  # reference to the uploaded synonym set
    },
}

# Indexing and searching use separate analyzers: only the search analyzer
# applies the synonym filter.
_HUNGARIAN_ANALYZERS = {
    "hungarian_indexing_analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "hungarian_stop", "hungarian_stemmer"],
    },
    "hungarian_search_analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "hungarian_stop", "synonym_filter", "hungarian_stemmer"],
    },
}

INDEX_SETTINGS_SEPARATE_ANALYZER = {
    "analysis": {
        "filter": _HUNGARIAN_TOKEN_FILTERS,
        "analyzer": _HUNGARIAN_ANALYZERS,
    }
}


def _hungarian_text_field():
    """Return a fresh mapping entry for a Hungarian full-text field."""
    return {
        "type": "text",
        "analyzer": "hungarian_indexing_analyzer",
        "search_analyzer": "hungarian_search_analyzer",
    }


INDEX_MAPPINGS_WEB = {
    "properties": {
        "text_content": _hungarian_text_field(),
        # "dims" is overwritten with the real model dimension once the
        # embedding model has been loaded.
        "embedding": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        "source_origin": {"type": "keyword"},
        "source_url": {"type": "keyword"},
        "source_type": {"type": "keyword"},
        "category": {"type": "keyword"},
        "heading": _hungarian_text_field(),
        "summary": _hungarian_text_field(),
    }
}
# --- Helper functions ---
def initialize_es_client():
    """Create an Elasticsearch client from the environment-provided credentials.

    Returns:
        The connected client, or None when a credential is missing, the
        cluster does not answer a ping, or any other error occurs while
        connecting (the error and its traceback are printed).
    """
    if DEBUG_MODE:
        print("\nKapcsolódás az Elasticsearch-hez a GitHub Secrets adatokkal...")

    # Guard clauses: both credentials are mandatory.
    if not ES_CLOUD_ID:
        print(f"{RED}Hiba: ES_CLOUD_ID környezeti változó hiányzik! Ezt a GitHub Secrets-ben kell beállítani.{RESET}")
        return None
    if not ES_API_KEY:
        print(f"{RED}Hiba: ES_API_KEY környezeti változó hiányzik! Ezt a GitHub Secrets-ben kell beállítani.{RESET}")
        return None

    connection_kwargs = {
        "cloud_id": ES_CLOUD_ID,
        "api_key": ES_API_KEY,
        "request_timeout": ES_CLIENT_TIMEOUT,
    }
    try:
        es = Elasticsearch(**connection_kwargs)
        reachable = es.ping()
        if not reachable:
            # Raised inside the try on purpose so it is reported by the
            # handler below like every other connection problem.
            raise ConnectionError("Nem sikerült pingelni az Elasticsearch-t.")
    except Exception as exc:
        print(f"{RED}Hiba az Elasticsearch kapcsolódás során: {exc}{RESET}")
        traceback.print_exc()
        return None

    if DEBUG_MODE:
        print(f"{GREEN}Sikeres Elasticsearch kapcsolat!{RESET}")
    return es
def load_embedding_model():
    """Load the SentenceTransformer model and publish it through module globals.

    On success the globals ``embedding_model``, ``EMBEDDING_DIM`` and
    ``device`` are updated and the ``dims`` entry of the embedding field in
    ``INDEX_MAPPINGS_WEB`` is set to the model's dimension.

    Returns:
        A ``(model, dimension, device)`` tuple. When the required libraries
        are missing or loading fails, the tuple is ``(None, 768, 'cpu')``.
    """
    global embedding_model, EMBEDDING_DIM, device

    if not (TORCH_AVAILABLE and SENTENCE_TRANSFORMER_AVAILABLE):
        # Without both libraries no model can be built; only the dimension
        # and device globals are reset here.
        EMBEDDING_DIM = 768
        device = 'cpu'
        print(f"{RED}Hiba: PyTorch vagy SentenceTransformer nincs telepítve.{RESET}")
        return None, 768, 'cpu'

    print(f"\n'{EMBEDDING_MODEL_NAME}' embedding modell betöltése (SentenceTransformer)...")
    try:
        if torch.cuda.is_available():
            chosen_device = 'cuda'
        else:
            chosen_device = 'cpu'
        st_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=chosen_device)
        print(f"ST modell betöltve, használt eszköz: {st_model.device}")

        vector_size = st_model.get_sentence_embedding_dimension()
        if not vector_size:
            raise ValueError("Nem sikerült meghatározni az embedding dimenziót.")

        embedding_model = st_model
        EMBEDDING_DIM = vector_size
        device = chosen_device
        # Keep the index mapping in sync with the dimension of the loaded model.
        INDEX_MAPPINGS_WEB["properties"]["embedding"]["dims"] = vector_size
        return st_model, vector_size, chosen_device
    except Exception as exc:
        print(f"{RED}Hiba embedding modell betöltésekor: {exc}{RESET}")
        traceback.print_exc()
        embedding_model = None
        EMBEDDING_DIM = 768
        device = 'cpu'
        return None, 768, 'cpu'
# The remaining functions (generate_categories_with_llm, get_embedding, create_es_index, etc.)
# are kept in their original form because they already work correctly.
# They are included here for completeness.
# Order matters: more specific names come before names they contain
# ('IT biztonsági szolgáltatások' before 'szolgáltatások'), because the first
# substring match wins.
_KNOWN_CATEGORIES = (
    'IT biztonsági szolgáltatások', 'szolgáltatások', 'hardver', 'szoftver', 'hírek',
    'audiovizuális konferenciatechnika',
)
_FALLBACK_CATEGORY = 'egyéb'


def _match_known_category(candidate):
    """Return the first known category contained in *candidate* (case-insensitive), or None."""
    if not candidate:
        return None
    lowered = candidate.lower()
    for known in _KNOWN_CATEGORIES:
        if known.lower() in lowered:
            return known
    return None


def generate_categories_with_llm(llm_client, soup, text):
    """Pick a single category for a page.

    Sources are tried in order: the last breadcrumb item, the first H1
    heading, and finally an LLM prompt built from the first 1000 characters
    of *text*. Every source is best-effort: a failure is reported and the
    next source is tried.

    Args:
        llm_client: Client exposing ``chat.completions.create``; falsy to
            skip the LLM step.
        soup: Parsed page offering ``find(...)``.
        text: Plain text of the page.

    Returns:
        A one-element list with the matched category, or ``['egyéb']`` when
        nothing matched.
    """
    # 1) Last breadcrumb entry.
    try:
        breadcrumb = soup.find('nav', class_='breadcrumb')
        if breadcrumb:
            crumbs = [li.get_text(strip=True) for li in breadcrumb.find_all('li')]
            matched = _match_known_category(crumbs[-1]) if crumbs else None
            if matched:
                print(f"{GREEN} -> Kategória a breadcrumb alapján: '{matched}'{RESET}")
                return [matched]
    except Exception as exc:
        # Best-effort source: report the problem instead of hiding it, then
        # continue with the next source.
        print(f"{YELLOW} -> Breadcrumb feldolgozása sikertelen: {exc}{RESET}")

    # 2) First H1 heading.
    try:
        h1_tag = soup.find('h1')
        h1_text = h1_tag.get_text(strip=True) if h1_tag else ""
        matched = _match_known_category(h1_text)
        if matched:
            print(f"{GREEN} -> Kategória a H1 cím alapján: '{matched}'{RESET}")
            return [matched]
    except Exception as exc:
        print(f"{YELLOW} -> H1 cím feldolgozása sikertelen: {exc}{RESET}")

    # 3) LLM classification.
    if not llm_client:
        return [_FALLBACK_CATEGORY]
    try:
        categories_text = ", ".join(f"'{cat}'" for cat in _KNOWN_CATEGORIES)
        prompt = f"""Adott egy weboldal szövege. Adj meg egyetlen, rövid kategóriát a következő listából, ami a legjobban jellemzi a tartalmát. A válaszodban csak a kategória szerepeljen, más szöveg nélkül.
Lehetséges kategóriák: {categories_text}
Szöveg: {text[:1000]}
Kategória:"""
        response = llm_client.chat.completions.create(
            model=LLM_CHUNK_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=30,
        )
        if response and response.choices:
            # The message content may be None; treat that as "no answer".
            answer = response.choices[0].message.content or ""
            answer = answer.strip().replace("'", "").replace("`", "")
            matched = _match_known_category(answer)
            if matched:
                print(f"{GREEN} -> Kategória LLM generálás alapján: '{matched}'{RESET}")
                return [matched]
        return [_FALLBACK_CATEGORY]
    except Exception as exc:
        print(f"{RED}Hiba LLM kategorizáláskor: {exc}{RESET}")
        return [_FALLBACK_CATEGORY]
def generate_summary_with_llm(llm_client, text):
    """Summarise *text* with the LLM, falling back to its leading characters.

    Args:
        llm_client: Client exposing ``chat.completions.create``; falsy to
            skip the LLM and use the fallback directly.
        text: Text to summarise; only the first 4000 characters are sent.

    Returns:
        The LLM summary, or the fallback: the first 300 characters of
        *text*, followed by "..." only when the text was actually cut.
    """
    text = text or ""
    # The ellipsis marks truncation, so it is added only when something was cut.
    fallback = text[:300] + "..." if len(text) > 300 else text

    if not llm_client:
        return fallback
    try:
        prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
Szöveg: {text[:4000]}
Összefoglalás:"""
        response = llm_client.chat.completions.create(
            model=LLM_CHUNK_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=500,
        )
        if response and response.choices:
            # The content may be None or blank; an empty summary is useless,
            # so the fallback is used in that case.
            summary = (response.choices[0].message.content or "").strip()
            if summary:
                print(f"{GREEN} -> Sikeres LLM összefoglalás generálás.{RESET}")
                return summary
    except Exception as exc:
        print(f"{RED}Hiba LLM összefoglaláskor: {exc}{RESET}")
    return fallback
def _window_spans(total, size, overlap):
    """Yield (start, end) index pairs of overlapping windows covering ``range(total)``."""
    step = size - overlap
    start = 0
    while start < total:
        end = start + size
        yield start, end
        if end >= total:
            # This window already reaches the end; another one would only
            # repeat the overlap region.
            break
        start += step


def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
    """Split *text* into overlapping chunks.

    With tiktoken available the windows are measured in tokens; otherwise
    they are measured in characters, approximating one token as four
    characters.

    Args:
        text: Text to split.
        chunk_size: Window length in tokens; must be positive.
        chunk_overlap: Tokens shared by consecutive windows; must be smaller
            than *chunk_size*.

    Returns:
        The list of chunk strings; empty for empty input.

    Raises:
        ValueError: If the window parameters could never advance.
    """
    if chunk_size <= 0:
        raise ValueError("A chunk_size értékének pozitívnak kell lennie.")
    if chunk_overlap >= chunk_size:
        # A non-positive step would never move the window forward.
        raise ValueError("A chunk_overlap értékének kisebbnek kell lennie, mint a chunk_size.")
    if not text:
        return []

    if not TIKTOKEN_AVAILABLE:
        # Character-based approximation: roughly four characters per token.
        char_size = chunk_size * 4
        char_overlap = chunk_overlap * 4
        return [text[lo:hi] for lo, hi in _window_spans(len(text), char_size, char_overlap)]

    tokens = tiktoken_encoder.encode(text)
    return [
        tiktoken_encoder.decode(tokens[lo:hi])
        for lo, hi in _window_spans(len(tokens), chunk_size, chunk_overlap)
    ]
def get_embedding(text):
    """Encode *text* with the loaded embedding model.

    Returns:
        The normalised embedding as a plain list, or None when no model is
        loaded, *text* is empty or not a string, or encoding fails (the
        error is printed).
    """
    if not embedding_model or not text or not isinstance(text, str):
        return None
    try:
        vector = embedding_model.encode(text, normalize_embeddings=True)
        return vector.tolist()
    except Exception as exc:
        print(f"{RED}Hiba embedding közben: {exc}{RESET}")
        return None
def create_es_index(client, index_name, index_settings, index_mappings):
    """Create the index with the given settings and mappings unless it already exists.

    Returns:
        True when the index exists after the call (created now or already
        present), False when checking or creating it raised; the error and
        its traceback are printed in that case.
    """
    if DEBUG_MODE:
        print(f"\nIndex ellenőrzése: '{index_name}'...")
    try:
        already_present = client.indices.exists(index=index_name)
        if already_present:
            print(f"Index '{index_name}' már létezik.")
        else:
            print(f"'{index_name}' index létrehozása...")
            client.indices.create(
                index=index_name,
                settings=index_settings,
                mappings=index_mappings,
            )
            print(f"{GREEN}Index sikeresen létrehozva.{RESET}")
    except Exception as err:
        print(f"{RED}!!! Hiba az index létrehozásakor: {err}{RESET}")
        traceback.print_exc()
        return False
    return True
def extract_text_from_html(html_content):
    """Return the readable text of an HTML document, one non-blank line per row.

    Script, style, navigation, footer, header, aside and form elements are
    removed first; the text is then taken from ``<main>``, else
    ``<article>``, else ``<body>``.

    Returns:
        The extracted text, or an empty string when no container is found
        or parsing fails (the error is printed).
    """
    noise_tags = ["script", "style", "nav", "footer", "header", "aside", "form"]
    try:
        document = BeautifulSoup(html_content, 'html.parser')
        for node in document.find_all(noise_tags):
            if node:
                node.decompose()

        container = document.find('main') or document.find('article') or document.body
        if not container:
            return ""

        raw_text = container.get_text(separator='\n', strip=True)
        kept_lines = [row for row in raw_text.splitlines() if row.strip()]
        return "\n".join(kept_lines)
    except Exception as exc:
        print(f"{RED}Hiba a HTML tartalom kinyerésekor: {exc}{RESET}")
    return ""
def _normalize_host(host):
    """Lower-case a host and drop one leading ``www.`` so equivalent hosts compare equal."""
    host = (host or "").lower()
    return host[4:] if host.startswith("www.") else host


def extract_and_filter_links(soup, base_url, target_domain):
    """Collect absolute http(s) links on the page that belong to the target site.

    Host comparison ignores letter case and a leading ``www.``, so links to
    ``example.com`` are kept when the crawl started on ``www.example.com``
    (and vice versa).

    Args:
        soup: Parsed page offering ``find_all('a', href=True)``.
        base_url: URL of the page, used to resolve relative links.
        target_domain: Host (netloc) the crawl is restricted to.

    Returns:
        A set of absolute URLs with the fragment removed and the host part
        lower-cased, so the same page is not returned under several
        spellings.
    """
    wanted_host = _normalize_host(target_domain)
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        if not href or href.startswith(('#', 'mailto:', 'javascript:')):
            continue
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme not in ('http', 'https'):
            continue
        if _normalize_host(parsed.netloc) != wanted_host:
            continue
        links.add(parsed._replace(netloc=parsed.netloc.lower(), fragment="").geturl())
    return links
def _flush_bulk_actions(es_client, actions):
    """Send *actions* through the bulk helper and return how many were indexed; never raises."""
    if not actions:
        return 0
    try:
        # raise_on_error=False: per-document failures are returned as a list
        # instead of aborting the whole batch with an exception.
        success_count, errors = helpers.bulk(es_client, actions, raise_on_error=False)
    except Exception as exc:
        print(f" {RED}!!! Hiba a bulk indexelés során: {exc}{RESET}")
        traceback.print_exc()
        return 0
    if errors:
        print(f" {YELLOW}-> {len(errors)} dokumentum indexelése sikertelen volt.{RESET}")
    return success_count


def crawl_and_index_website(start_url, max_depth, es_client, index_name):
    """Crawl the site breadth-first from *start_url* and index its text chunks.

    Each HTML page is fetched, its links are queued (while below
    *max_depth*), its text is chunked, every chunk is embedded, and the
    resulting documents are sent to *index_name* in batches of BATCH_SIZE.

    Args:
        start_url: First URL to fetch; its host restricts the crawl.
        max_depth: Maximum link distance from *start_url* that is fetched.
        es_client: Elasticsearch client used for bulk indexing.
        index_name: Name of the target index.

    Returns:
        The number of chunks reported as successfully indexed.
    """
    visited_urls = set()
    # Every URL ever queued is tracked so the same link found on many pages
    # is enqueued once. The queue is FIFO, so the first sighting of a URL is
    # also its shallowest one.
    queued_urls = {start_url}
    urls_to_visit = deque([(start_url, 0)])
    bulk_actions = []
    total_indexed = 0
    target_domain = urlparse(start_url).netloc
    headers = {'User-Agent': USER_AGENT}
    print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")

    while urls_to_visit:
        current_url, current_depth = urls_to_visit.popleft()
        if current_url in visited_urls or current_depth > max_depth:
            continue
        print(f"\n--- Feldolgozás (Mélység: {current_depth}): {current_url} ---")
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, headers=headers, timeout=15)
            response.raise_for_status()
            if 'text/html' not in response.headers.get('content-type', '').lower():
                print(f" {YELLOW}-> Nem HTML tartalom, kihagyva.{RESET}")
                continue

            html_content = response.content
            soup = BeautifulSoup(html_content, 'html.parser')

            # Queue outgoing links before judging the page's own text: a page
            # with too little text to index can still link to useful pages.
            if current_depth < max_depth:
                for link in extract_and_filter_links(soup, current_url, target_domain):
                    if link not in queued_urls:
                        queued_urls.add(link)
                        urls_to_visit.append((link, current_depth + 1))

            page_text = extract_text_from_html(html_content)
            if not page_text or len(page_text) < MIN_CHUNK_SIZE_CHARS:
                print(f" {YELLOW}-> Nem sikerült szöveget kinyerni vagy túl rövid.{RESET}")
                continue

            final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
            url_category = generate_categories_with_llm(together_client, soup, page_text)[0]
            page_summary = generate_summary_with_llm(together_client, page_text)
            print(f"{GREEN} Indexelésre előkészítve: {len(final_chunks)} darab (Kategória: {url_category}){RESET}")

            for chunk_text in final_chunks:
                element_vector = get_embedding(chunk_text)
                if not element_vector:
                    continue
                doc = {
                    "text_content": chunk_text,
                    "embedding": element_vector,
                    "source_origin": "website",
                    "source_url": current_url,
                    "source_type": "token_chunking",
                    "category": url_category,
                    "summary": page_summary,
                }
                bulk_actions.append({"_index": index_name, "_source": doc})
                if len(bulk_actions) >= BATCH_SIZE:
                    print(f" -> {len(bulk_actions)} chunk indexelése (batch)...")
                    total_indexed += _flush_bulk_actions(es_client, bulk_actions)
                    # Always start a fresh batch, even after a failed flush,
                    # so a bad batch is not retried and grown forever.
                    bulk_actions = []
        except requests.exceptions.RequestException as req_err:
            print(f" {RED}!!! Hiba a letöltés során: {req_err}{RESET}")
        except Exception as exc:
            print(f" {RED}!!! Váratlan hiba a ciklusban ({current_url}): {exc}{RESET}")
            traceback.print_exc()
        finally:
            # Politeness delay after every request attempt, including
            # skipped pages and failures.
            time.sleep(REQUEST_DELAY)

    if bulk_actions:
        print(f" -> Maradék {len(bulk_actions)} chunk indexelése...")
        total_indexed += _flush_bulk_actions(es_client, bulk_actions)

    print(f"\n--- Web Crawling és Indexelés Befejezve ---")
    print(f"Meglátogatott URL-ek: {len(visited_urls)}")
    print(f"Sikeresen indexelt chunkok: {total_indexed}")
    return total_indexed
if __name__ == "__main__":
    # Script entry point: load the embedding model, connect to Elasticsearch,
    # make sure the index exists, then crawl and index the site.
    # Every fatal failure ends with a non-zero exit status so an automated
    # run (e.g. a CI job) is reported as failed instead of successful.
    print(f"----- Web Crawler és Indexelő Indítása -----")

    embedding_model, EMBEDDING_DIM, device = load_embedding_model()
    if not all([embedding_model, EMBEDDING_DIM]):
        print(f"{RED}Hiba: Az embedding modell betöltése sikertelen. A program leáll.{RESET}")
        # sys.exit instead of the interactive-only builtin exit().
        sys.exit(1)

    es_client = initialize_es_client()
    if not es_client:
        print(f"{RED}Hiba: Az Elasticsearch kliens nem elérhető. A program leáll.{RESET}")
        sys.exit(1)

    if es_client.indices.exists(index=VECTOR_INDEX_NAME):
        # An existing index is reused as-is; its settings are not verified.
        print(f"{YELLOW}Figyelem: A '{VECTOR_INDEX_NAME}' index már létezik. A script feltételezi, hogy a beállításai helyesek.{RESET}")
        print(f"{YELLOW}Ha újra akarod építeni, töröld manuálisan: DELETE /{VECTOR_INDEX_NAME}{RESET}")
        index_ready = True
    else:
        index_ready = create_es_index(
            client=es_client,
            index_name=VECTOR_INDEX_NAME,
            index_settings=INDEX_SETTINGS_SEPARATE_ANALYZER,
            index_mappings=INDEX_MAPPINGS_WEB
        )

    if not index_ready:
        print(f"{RED}Hiba: Az index nem áll készen a használatra. A program leáll.{RESET}")
        sys.exit(1)

    print(f"\nIndex '{VECTOR_INDEX_NAME}' kész. Web crawling és indexelés indítása...")
    final_success_count = crawl_and_index_website(START_URL, MAX_DEPTH, es_client, VECTOR_INDEX_NAME)
    if final_success_count > 0:
        print(f"\n{GREEN}A folyamat sikeresen lefutott. {final_success_count} dokumentum indexelve.{RESET}")
    else:
        print(f"\n{YELLOW}A folyamat lefutott, de 0 új dokumentum került indexelésre.{RESET}")