datbkpro commited on
Commit
6469f7e
·
verified ·
1 Parent(s): 8be02b3

Update core/wikipedia_processor.py

Browse files
Files changed (1) hide show
  1. core/wikipedia_processor.py +52 -15
core/wikipedia_processor.py CHANGED
@@ -1,9 +1,3 @@
1
- import os
2
- import json
3
- import pandas as pd
4
- from typing import List
5
- from models.schemas import RAGDocument
6
-
7
  class WikipediaProcessor:
8
  def __init__(self):
9
  self.supported_formats = ['.txt', '.csv', '.json']
@@ -13,6 +7,8 @@ class WikipediaProcessor:
13
  file_ext = os.path.splitext(file_path)[1].lower()
14
 
15
  try:
 
 
16
  if file_ext == '.txt':
17
  return self._process_txt_file(file_path)
18
  elif file_ext == '.csv':
@@ -22,36 +18,73 @@ class WikipediaProcessor:
22
  else:
23
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
24
  except Exception as e:
 
25
  raise Exception(f"Lỗi xử lý file: {str(e)}")
26
 
27
  def _process_txt_file(self, file_path: str) -> List[str]:
28
  """Xử lý file text"""
29
- with open(file_path, 'r', encoding='utf-8') as f:
30
- content = f.read()
31
-
32
- paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
33
- return paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def _process_csv_file(self, file_path: str) -> List[str]:
36
  """Xử lý file CSV"""
 
37
  try:
38
  df = pd.read_csv(file_path)
39
  documents = []
40
 
41
- for _, row in df.iterrows():
 
 
42
  doc_parts = []
43
  for col in df.columns:
44
  if pd.notna(row[col]) and str(row[col]).strip():
45
  doc_parts.append(f"{col}: {row[col]}")
 
46
  if doc_parts:
47
- documents.append(" | ".join(doc_parts))
 
 
 
 
 
48
 
 
49
  return documents
50
  except Exception as e:
 
51
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
52
 
53
  def _process_json_file(self, file_path: str) -> List[str]:
54
  """Xử lý file JSON"""
 
55
  try:
56
  with open(file_path, 'r', encoding='utf-8') as f:
57
  data = json.load(f)
@@ -63,12 +96,16 @@ class WikipediaProcessor:
63
  for key, value in obj.items():
64
  extract_text(value, f"{current_path}.{key}" if current_path else key)
65
  elif isinstance(obj, list):
66
- for item in obj:
67
- extract_text(item, current_path)
68
  elif isinstance(obj, str) and len(obj.strip()) > 10:
69
  documents.append(f"{current_path}: {obj.strip()}")
 
 
70
 
71
  extract_text(data)
 
72
  return documents
73
  except Exception as e:
 
74
  raise Exception(f"Lỗi đọc JSON: {str(e)}")
 
 
 
 
 
 
 
1
import os
import json
import traceback
from typing import List

import pandas as pd


class WikipediaProcessor:
    """Load Wikipedia-style corpus files and split them into text documents.

    Supports .txt (paragraph splitting), .csv (one document per row), and
    .json (recursive leaf extraction). All extraction methods return a flat
    ``List[str]`` suitable for downstream indexing.
    """

    def __init__(self):
        # Extensions that process_file() knows how to dispatch on.
        self.supported_formats = ['.txt', '.csv', '.json']

    # NOTE(review): the `def` line of this method is not visible in the diff
    # hunks; the name and signature below are reconstructed from context —
    # confirm against the repository before relying on them.
    def process_file(self, file_path: str) -> List[str]:
        """Dispatch on the file extension and return the extracted documents.

        Args:
            file_path: Path to a .txt, .csv, or .json file.

        Returns:
            List of extracted document strings.

        Raises:
            Exception: wraps any underlying error (including ValueError for
                unsupported extensions) with a localized message; the original
                exception is chained as the cause.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")

            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            # Chain the cause so the original traceback survives the re-raise.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Extract paragraphs longer than 10 chars from a text file.

        Tries UTF-8 first and falls back to latin-1 on decode errors.
        """
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Prefer blank-line paragraph breaks; fall back to single
            # newlines when the file has no double-newline separators.
            if '\n\n' in content:
                paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
            else:
                paragraphs = [p.strip() for p in content.split('\n') if p.strip()]

            # Drop fragments too short to be useful documents.
            paragraphs = [p for p in paragraphs if len(p.strip()) > 10]

            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text")
            return paragraphs

        except UnicodeDecodeError:
            # Fallback encoding for non-UTF-8 input.
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()

            paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text (latin-1)")
            return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Turn each CSV row into one 'col: value | col: value' document.

        Empty/NaN cells are skipped; documents of 10 chars or fewer are dropped.
        """
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []

            print(f"📋 CSV {len(df)} hàng và {len(df.columns)} cột")

            for idx, row in df.iterrows():
                doc_parts = []
                for col in df.columns:
                    if pd.notna(row[col]) and str(row[col]).strip():
                        doc_parts.append(f"{col}: {row[col]}")

                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > 10:  # Ensure minimum length
                        documents.append(full_doc)

                if idx < 3:  # Log first few rows for debugging
                    print(f"📝 Hàng {idx}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Recursively extract leaf values from a JSON file as documents.

        Strings longer than 10 chars and all numeric/bool leaves are emitted
        as '<dotted.path>: <value>'; list elements get '[i]' path suffixes.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                # Walk dicts/lists; record string and numeric leaves with
                # their dotted/indexed path for traceability.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str) and len(obj.strip()) > 10:
                    documents.append(f"{current_path}: {obj.strip()}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e