Király Zoltán committed
Commit · 79aa6e9
Parent(s): 26ee8cf
Fix: Clean up requirements.txt to resolve build conflicts

Files changed: web_indexer_universal_v7.py (+7 -10)

web_indexer_universal_v7.py CHANGED
@@ -186,7 +186,6 @@ Kategóriák:"""
     return ["általános"]
 
 def generate_summary_with_llm(llm_client, text):
-    # This function is unchanged
     if not llm_client: return text[:300] + "..."
     try:
         prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
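Note: the fallback pattern visible above (truncate to 300 characters when no client is available or the call fails) can be sketched on its own. This is a minimal sketch assuming an OpenAI-compatible chat-completions client such as the Together SDK; the model id is a placeholder, not taken from the diff.

    # Minimal sketch of generate_summary_with_llm's fallback pattern.
    # Assumes an OpenAI-compatible chat client; the model id is hypothetical.
    def summarize_or_truncate(llm_client, text, limit=300):
        if not llm_client:
            return text[:limit] + "..."
        try:
            response = llm_client.chat.completions.create(
                model="example-org/summary-model",  # placeholder, not from the diff
                messages=[{"role": "user",
                           "content": f"Summarize briefly: {text[:4000]}"}],
            )
            return response.choices[0].message.content
        except Exception:
            return text[:limit] + "..."  # degrade gracefully to plain truncation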
@@ -202,7 +201,6 @@ Szöveg: {text[:4000]}
         return text[:300] + "..."
 
 def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
-    # This function is unchanged
     if not TIKTOKEN_AVAILABLE:
         chunks, start = [], 0
         while start < len(text):
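The function falls back to character-based windows when tiktoken is unavailable; the token path looks roughly like this (a sketch, assuming the cl100k_base encoding and chunk_overlap < chunk_size):

    import tiktoken

    def chunk_by_tokens(text, chunk_size, chunk_overlap):
        # Sketch of the token path; cl100k_base is an assumed encoding choice.
        enc = tiktoken.get_encoding("cl100k_base")
        tokens = enc.encode(text)
        chunks, start = [], 0
        while start < len(tokens):
            chunks.append(enc.decode(tokens[start:start + chunk_size]))
            start += chunk_size - chunk_overlap  # requires overlap < size
        return chunks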
@@ -220,7 +218,6 @@ def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
     return chunks
 
 def get_embedding(text):
-    # This function is unchanged
     if not embedding_model: return None
     try:
         return embedding_model.encode(text, normalize_embeddings=True).tolist()
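get_embedding depends on a preloaded sentence-transformers model that the diff never names; a self-contained sketch with an assumed multilingual model:

    from sentence_transformers import SentenceTransformer

    # Model name is an assumption; the diff only shows the encode() call.
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    def embed(text):
        if not embedding_model or not text:
            return None
        # normalize_embeddings=True returns unit vectors, so cosine similarity
        # reduces to a plain dot product at query time.
        return embedding_model.encode(text, normalize_embeddings=True).tolist()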
@@ -229,7 +226,6 @@ def get_embedding(text):
         return None
 
 def create_es_index(client, index_name, index_settings, index_mappings):
-    # This function is unchanged
     print(f"\n{CYAN}Index ellenőrzése: '{index_name}'...{RESET}")
     try:
         if not client.indices.exists(index=index_name):
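The exists-then-create guard keeps index creation idempotent; a sketch assuming the elasticsearch-py 8.x keyword arguments:

    from elasticsearch import Elasticsearch

    def ensure_index(client, index_name, index_settings, index_mappings):
        # Create the index only when it is missing, mirroring the guard above.
        if client.indices.exists(index=index_name):
            return True
        try:
            client.indices.create(index=index_name,
                                  settings=index_settings,
                                  mappings=index_mappings)
            return True
        except Exception as exc:
            print(f"Index creation failed: {exc}")
            return False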
@@ -244,7 +240,6 @@ def create_es_index(client, index_name, index_settings, index_mappings):
         return False
 
 def extract_text_from_html(html_content):
-    # This function is unchanged
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
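The extraction strategy is visible in the context lines: strip non-content tags, then take the remaining text. A compact sketch of that approach:

    from bs4 import BeautifulSoup

    def html_to_text(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove page chrome and script noise before extracting visible text.
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
            element.decompose()
        return soup.get_text(separator=' ', strip=True)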
@@ -257,7 +252,6 @@ def extract_text_from_html(html_content):
         return ""
 
 def extract_and_filter_links(soup, base_url, target_domain):
-    # This function is unchanged
     links = set()
     for a_tag in soup.find_all('a', href=True):
         href = a_tag['href'].strip()
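The body of the link filter sits mostly outside this hunk; a plausible sketch of what such a same-domain filter does, using only urllib.parse (the skip-list of schemes is an assumption):

    from urllib.parse import urljoin, urlparse

    def filter_links(soup, base_url, target_domain):
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if href.startswith(('#', 'mailto:', 'javascript:')):  # assumed skip-list
                continue
            absolute = urljoin(base_url, href)
            parsed = urlparse(absolute)
            if parsed.scheme in ('http', 'https') and parsed.netloc == target_domain:
                links.add(absolute.split('#')[0])  # drop fragments to avoid re-queuing
        return links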
@@ -275,7 +269,11 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
     print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")
 
     while urls_to_visit:
-        current_url, current_depth = urls_to_visit.popleft()
+        try:
+            current_url, current_depth = urls_to_visit.popleft()
+        except IndexError:
+            break  # No more URLs in the list
+
         if current_url in visited_urls:
             continue
 
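The added try/except is defensive: `while urls_to_visit` already guards an empty deque in single-threaded code, so the IndexError branch only matters if the queue can be drained elsewhere mid-iteration. The queue discipline, isolated:

    from collections import deque

    urls_to_visit = deque([("https://example.com", 0)])  # (url, depth); URL is illustrative
    visited_urls = set()

    while urls_to_visit:
        try:
            current_url, current_depth = urls_to_visit.popleft()
        except IndexError:
            break  # queue drained between the truthiness check and the pop
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)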
@@ -299,7 +297,6 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
             continue
 
         final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
-        # FIXED: call the new dynamic category-generation function
         categories = generate_dynamic_categories_with_llm(together_client, soup, page_text)
         page_summary = generate_summary_with_llm(together_client, page_text)
 
@@ -311,7 +308,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
             doc = {
                 "text_content": chunk_text, "embedding": element_vector, "source_origin": "website",
                 "source_url": current_url, "source_type": "token_chunking",
-                "category": categories, "summary": page_summary
+                "category": categories, "summary": page_summary, "heading": soup.find('h1').get_text(strip=True) if soup.find('h1') else ''
             }
             bulk_actions.append({"_index": index_name, "_source": doc})
 
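One small nit on the added line: soup.find('h1') runs twice per chunk. An equivalent single-lookup form, should the author want it:

    h1 = soup.find('h1')
    doc["heading"] = h1.get_text(strip=True) if h1 else ''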
@@ -322,7 +319,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
                 bulk_actions = []
 
         if current_depth < max_depth:
-            new_links = extract_and_filter_links(soup,
+            new_links = extract_and_filter_links(soup, start_url, target_domain)
             for link in new_links:
                 if link not in visited_urls:
                     urls_to_visit.append((link, current_depth + 1))
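The context lines show bulk_actions being accumulated and periodically reset to []; the flush step itself falls outside the hunks. A sketch assuming the standard elasticsearch.helpers bulk API:

    from elasticsearch.helpers import bulk

    def flush_actions(es_client, bulk_actions):
        # Send buffered documents in one bulk request, then reset the buffer.
        if not bulk_actions:
            return 0
        success_count, _errors = bulk(es_client, bulk_actions, raise_on_error=False)
        bulk_actions.clear()
        return success_count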
|