Király Zoltán committed on
Commit
79aa6e9
·
1 Parent(s): 26ee8cf

Fix: Clean up requirements.txt to resolve build conflicts

Browse files
Files changed (1) hide show
  1. web_indexer_universal_v7.py +7 -10
web_indexer_universal_v7.py CHANGED
@@ -186,7 +186,6 @@ Kategóriák:"""
186
  return ["általános"]
187
 
188
  def generate_summary_with_llm(llm_client, text):
189
- # Ez a függvény változatlan
190
  if not llm_client: return text[:300] + "..."
191
  try:
192
  prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
@@ -202,7 +201,6 @@ Szöveg: {text[:4000]}
202
  return text[:300] + "..."
203
 
204
  def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
205
- # Ez a függvény változatlan
206
  if not TIKTOKEN_AVAILABLE:
207
  chunks, start = [], 0
208
  while start < len(text):
@@ -220,7 +218,6 @@ def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
220
  return chunks
221
 
222
  def get_embedding(text):
223
- # Ez a függvény változatlan
224
  if not embedding_model: return None
225
  try:
226
  return embedding_model.encode(text, normalize_embeddings=True).tolist()
@@ -229,7 +226,6 @@ def get_embedding(text):
229
  return None
230
 
231
  def create_es_index(client, index_name, index_settings, index_mappings):
232
- # Ez a függvény változatlan
233
  print(f"\n{CYAN}Index ellenőrzése: '{index_name}'...{RESET}")
234
  try:
235
  if not client.indices.exists(index=index_name):
@@ -244,7 +240,6 @@ def create_es_index(client, index_name, index_settings, index_mappings):
244
  return False
245
 
246
  def extract_text_from_html(html_content):
247
- # Ez a függvény változatlan
248
  try:
249
  soup = BeautifulSoup(html_content, 'html.parser')
250
  for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
@@ -257,7 +252,6 @@ def extract_text_from_html(html_content):
257
  return ""
258
 
259
  def extract_and_filter_links(soup, base_url, target_domain):
260
- # Ez a függvény változatlan
261
  links = set()
262
  for a_tag in soup.find_all('a', href=True):
263
  href = a_tag['href'].strip()
@@ -275,7 +269,11 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
275
  print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")
276
 
277
  while urls_to_visit:
278
- current_url, current_depth = urls_to_visit.popleft()
 
 
 
 
279
  if current_url in visited_urls:
280
  continue
281
 
@@ -299,7 +297,6 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
299
  continue
300
 
301
  final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
302
- # JAVÍTVA: Az új, dinamikus kategória generáló függvény hívása
303
  categories = generate_dynamic_categories_with_llm(together_client, soup, page_text)
304
  page_summary = generate_summary_with_llm(together_client, page_text)
305
 
@@ -311,7 +308,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
311
  doc = {
312
  "text_content": chunk_text, "embedding": element_vector, "source_origin": "website",
313
  "source_url": current_url, "source_type": "token_chunking",
314
- "category": categories, "summary": page_summary
315
  }
316
  bulk_actions.append({"_index": index_name, "_source": doc})
317
 
@@ -322,7 +319,7 @@ def crawl_and_index_website(start_url, max_depth, es_client, index_name):
322
  bulk_actions = []
323
 
324
  if current_depth < max_depth:
325
- new_links = extract_and_filter_links(soup, current_url, target_domain)
326
  for link in new_links:
327
  if link not in visited_urls:
328
  urls_to_visit.append((link, current_depth + 1))
 
186
  return ["általános"]
187
 
188
  def generate_summary_with_llm(llm_client, text):
 
189
  if not llm_client: return text[:300] + "..."
190
  try:
191
  prompt = f"""Készíts egy rövid, de informatív összefoglalót a következő szövegről magyarul.
 
201
  return text[:300] + "..."
202
 
203
  def chunk_text_by_tokens(text, chunk_size, chunk_overlap):
 
204
  if not TIKTOKEN_AVAILABLE:
205
  chunks, start = [], 0
206
  while start < len(text):
 
218
  return chunks
219
 
220
  def get_embedding(text):
 
221
  if not embedding_model: return None
222
  try:
223
  return embedding_model.encode(text, normalize_embeddings=True).tolist()
 
226
  return None
227
 
228
  def create_es_index(client, index_name, index_settings, index_mappings):
 
229
  print(f"\n{CYAN}Index ellenőrzése: '{index_name}'...{RESET}")
230
  try:
231
  if not client.indices.exists(index=index_name):
 
240
  return False
241
 
242
  def extract_text_from_html(html_content):
 
243
  try:
244
  soup = BeautifulSoup(html_content, 'html.parser')
245
  for element in soup(["script", "style", "nav", "footer", "header", "aside", "form"]):
 
252
  return ""
253
 
254
  def extract_and_filter_links(soup, base_url, target_domain):
 
255
  links = set()
256
  for a_tag in soup.find_all('a', href=True):
257
  href = a_tag['href'].strip()
 
269
  print(f"Web crawling indítása: {start_url} (Max mélység: {max_depth}, Cél: {target_domain})")
270
 
271
  while urls_to_visit:
272
+ try:
273
+ current_url, current_depth = urls_to_visit.popleft()
274
+ except IndexError:
275
+ break # Nincs több URL a listában
276
+
277
  if current_url in visited_urls:
278
  continue
279
 
 
297
  continue
298
 
299
  final_chunks = chunk_text_by_tokens(page_text, CHUNK_SIZE_TOKENS, CHUNK_OVERLAP_TOKENS)
 
300
  categories = generate_dynamic_categories_with_llm(together_client, soup, page_text)
301
  page_summary = generate_summary_with_llm(together_client, page_text)
302
 
 
308
  doc = {
309
  "text_content": chunk_text, "embedding": element_vector, "source_origin": "website",
310
  "source_url": current_url, "source_type": "token_chunking",
311
+ "category": categories, "summary": page_summary, "heading": soup.find('h1').get_text(strip=True) if soup.find('h1') else ''
312
  }
313
  bulk_actions.append({"_index": index_name, "_source": doc})
314
 
 
319
  bulk_actions = []
320
 
321
  if current_depth < max_depth:
322
+ new_links = extract_and_filter_links(soup, start_url, target_domain)
323
  for link in new_links:
324
  if link not in visited_urls:
325
  urls_to_visit.append((link, current_depth + 1))