Király Zoltán committed
Commit · 79aa6e9
Parent(s): 26ee8cf
Fix: Clean up requirements.txt to resolve build conflicts

Files changed: web_indexer_universal_v7.py (+7 -10)

web_indexer_universal_v7.py CHANGED
@@ -186,7 +186,6 @@ Kategóriák:"""
     return ["általános"]
 
 def generate_summary_with_llm(llm_client, text):
-    # This function is unchanged
     if not llm_client: return text[:300] + "..."
     try:
         prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
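Note: the fallback pattern visible above (truncate to 300 characters when no client is available or the call fails) can be sketched on its own. This is a minimal sketch assuming an OpenAI-compatible chat-completions client such as the Together SDK; the model id is a placeholder, not taken from the diff.

    # Minimal sketch of generate_summary_with_llm's fallback pattern.
    # Assumes an OpenAI-compatible chat client; the model id is hypothetical.
    def summarize_or_truncate(llm_client, text, limit=300):
        if not llm_client:
            return text[:limit] + "..."
        try:
            response = llm_client.chat.completions.create(
                model="example-org/summary-model",  # placeholder, not from the diff
                messages=[{"role": "user",
                           "content": f"Summarize briefly: {text[:4000]}"}],
            )
            return response.choices[0].message.content
        except Exception:
            return text[:limit] + "..."  # degrade gracefully to plain truncation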
@@ -202,7 +201,6 @@ Szöveg: {text[:4000]}
         return text[:300] + "..."
 
 def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
-    # This function is unchanged
     if not TIKTOKEN_AVAILABLE:
         chunks, start = [], 0
         while start < len(text):
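The function falls back to character-based windows when tiktoken is unavailable; the token path looks roughly like this (a sketch, assuming the cl100k_base encoding and chunk_overlap < chunk_size):

    import tiktoken

    def chunk_by_tokens(text, chunk_size, chunk_overlap):
        # Sketch of the token path; cl100k_base is an assumed encoding choice.
        enc = tiktoken.get_encoding("cl100k_base")
        tokens = enc.encode(text)
        chunks, start = [], 0
        while start < len(tokens):
            chunks.append(enc.decode(tokens[start:start + chunk_size]))
            start += chunk_size - chunk_overlap  # requires overlap < size
        return chunks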
@@ -220,7 +218,6 @@ def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
     return chunks
 
 def get_embedding(text):
-    # This function is unchanged
     if not embedding_model: return None
     try:
         return embedding_model.encode(text, normalize_embeddings=True).tolist()
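get_embedding depends on a preloaded sentence-transformers model that the diff never names; a self-contained sketch with an assumed multilingual model:

    from sentence_transformers import SentenceTransformer

    # Model name is an assumption; the diff only shows the encode() call.
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    def embed(text):
        if not embedding_model or not text:
            return None
        # normalize_embeddings=True returns unit vectors, so cosine similarity
        # reduces to a plain dot product at query time.
        return embedding_model.encode(text, normalize_embeddings=True).tolist()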
@@ -229,7 +226,6 @@ def get_embedding(text):
         return None
 
 def create_es_index(client, index_name, index_settings, index_mappings):
-    # This function is unchanged
     print(f"\n{CYAN}Index ellenőrzése: '{index_name}'...{RESET}")
     try:
         if not client.indices.exists(index=index_name):
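The exists-then-create guard keeps index creation idempotent; a sketch assuming the elasticsearch-py 8.x keyword arguments:

    from elasticsearch import Elasticsearch

    def ensure_index(client, index_name, index_settings, index_mappings):
        # Create the index only when it is missing, mirroring the guard above.
        if client.indices.exists(index=index_name):
            return True
        try:
            client.indices.create(index=index_name,
                                  settings=index_settings,
                                  mappings=index_mappings)
            return True
        except Exception as exc:
            print(f"Index creation failed: {exc}")
            return False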
@@ -244,7 +240,6 @@ def create_es_index(client, index_name, index_settings, index_mappings):
         return False
 
 def extract_text_from_html(html_content):
-    # This function is unchanged
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
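The extraction strategy is visible in the context lines: strip non-content tags, then take the remaining text. A compact sketch of that approach:

    from bs4 import BeautifulSoup

    def html_to_text(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove page chrome and script noise before extracting visible text.
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
            element.decompose()
        return soup.get_text(separator=' ', strip=True)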
@@ -257,7 +252,6 @@ def extract_text_from_html(html_content):
         return ""
 
 def extract_and_filter_links(soup, base_url, target_domain):
-    # This function is unchanged
     links = set()
     for a_tag in soup.find_all('a', href=True):
         href = a_tag['href'].strip()
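The body of the link filter sits mostly outside this hunk; a plausible sketch of what such a same-domain filter does, using only urllib.parse (the skip-list of schemes is an assumption):

    from urllib.parse import urljoin, urlparse

    def filter_links(soup, base_url, target_domain):
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if href.startswith(('#', 'mailto:', 'javascript:')):  # assumed skip-list
                continue
            absolute = urljoin(base_url, href)
            parsed = urlparse(absolute)
            if parsed.scheme in ('http', 'https') and parsed.netloc == target_domain:
                links.add(absolute.split('#')[0])  # drop fragments to avoid re-queuing
        return links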
@@ -275,7 +269,11 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
     print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")
 
     while urls_to_visit:
-        current_url, current_depth = urls_to_visit.popleft()
+        try:
+            current_url, current_depth = urls_to_visit.popleft()
+        except IndexError:
+            break  # No more URLs in the list
+
         if current_url in visited_urls:
             continue
 
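The added try/except is defensive: `while urls_to_visit` already guards an empty deque in single-threaded code, so the IndexError branch only matters if the queue can be drained elsewhere mid-iteration. The queue discipline, isolated:

    from collections import deque

    urls_to_visit = deque([("https://example.com", 0)])  # (url, depth); URL is illustrative
    visited_urls = set()

    while urls_to_visit:
        try:
            current_url, current_depth = urls_to_visit.popleft()
        except IndexError:
            break  # queue drained between the truthiness check and the pop
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)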
@@ -299,7 +297,6 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
             continue
 
         final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
-        # FIXED: call the new dynamic category-generation function
         categories = generate_dynamic_categories_with_llm(together_client, soup, page_text)
         page_summary = generate_summary_with_llm(together_client, page_text)
 
@@ -311,7 +308,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
             doc = {
                 "text_content": chunk_text, "embedding": element_vector, "source_origin": "website",
                 "source_url": current_url, "source_type": "token_chunking",
-                "category": categories, "summary": page_summary
+                "category": categories, "summary": page_summary, "heading": soup.find('h1').get_text(strip=True) if soup.find('h1') else ''
             }
             bulk_actions.append({"_index": index_name, "_source": doc})
 
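One small nit on the added line: soup.find('h1') runs twice per chunk. An equivalent single-lookup form, should the author want it:

    h1 = soup.find('h1')
    doc["heading"] = h1.get_text(strip=True) if h1 else ''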
@@ -322,7 +319,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
                 bulk_actions = []
 
         if current_depth < max_depth:
-            new_links = extract_and_filter_links(soup,
+            new_links = extract_and_filter_links(soup, start_url, target_domain)
             for link in new_links:
                 if link not in visited_urls:
                     urls_to_visit.append((link, current_depth + 1))
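The context lines show bulk_actions being accumulated and periodically reset to []; the flush step itself falls outside the hunks. A sketch assuming the standard elasticsearch.helpers bulk API:

    from elasticsearch.helpers import bulk

    def flush_actions(es_client, bulk_actions):
        # Send buffered documents in one bulk request, then reset the buffer.
        if not bulk_actions:
            return 0
        success_count, _errors = bulk(es_client, bulk_actions, raise_on_error=False)
        bulk_actions.clear()
        return success_count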
|