datbkpro committed on
Commit
b846ef6
·
verified ·
1 Parent(s): 7c433d8

Update core/wikipedia_processor.py

Browse files
Files changed (1) hide show
  1. core/wikipedia_processor.py +133 -25
core/wikipedia_processor.py CHANGED
@@ -2,73 +2,181 @@ import os
2
  import json
3
  import pandas as pd
4
  from typing import List
5
- from models.schemas import RAGDocument
 
6
 
7
class WikipediaProcessor:
    """Turns uploaded Wikipedia dump files (.txt/.csv/.json) into text documents."""

    def __init__(self):
        # Extensions this processor knows how to parse.
        self.supported_formats = ['.txt', '.csv', '.json']
10
 
11
  def process_uploaded_file(self, file_path: str) -> List[str]:
12
- """Xử lý file Wikipedia uploaded"""
 
 
 
 
 
13
  file_ext = os.path.splitext(file_path)[1].lower()
 
14
 
15
  try:
16
  if file_ext == '.txt':
17
- return self._process_txt_file(file_path)
18
  elif file_ext == '.csv':
19
- return self._process_csv_file(file_path)
20
  elif file_ext == '.json':
21
- return self._process_json_file(file_path)
22
  else:
23
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
 
 
 
 
24
  except Exception as e:
25
- raise Exception(f"Lỗi xử lý file: {str(e)}")
 
 
 
26
 
27
  def _process_txt_file(self, file_path: str) -> List[str]:
28
- """Xử lý file text"""
29
- with open(file_path, 'r', encoding='utf-8') as f:
30
- content = f.read()
31
-
32
- paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
33
- return paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def _process_csv_file(self, file_path: str) -> List[str]:
36
- """Xử lý file CSV"""
37
  try:
38
- df = pd.read_csv(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  documents = []
40
 
41
- for _, row in df.iterrows():
 
42
  doc_parts = []
43
  for col in df.columns:
44
  if pd.notna(row[col]) and str(row[col]).strip():
45
- doc_parts.append(f"{col}: {row[col]}")
 
 
 
46
  if doc_parts:
47
- documents.append(" | ".join(doc_parts))
 
 
 
48
 
 
 
 
49
  return documents
 
50
  except Exception as e:
51
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
52
 
53
  def _process_json_file(self, file_path: str) -> List[str]:
54
- """Xử lý file JSON"""
55
  try:
56
  with open(file_path, 'r', encoding='utf-8') as f:
57
  data = json.load(f)
58
 
 
 
59
  documents = []
60
 
61
- def extract_text(obj, current_path=""):
 
 
 
62
  if isinstance(obj, dict):
63
  for key, value in obj.items():
64
- extract_text(value, f"{current_path}.{key}" if current_path else key)
 
65
  elif isinstance(obj, list):
66
- for item in obj:
67
- extract_text(item, current_path)
68
- elif isinstance(obj, str) and len(obj.strip()) > 10:
69
- documents.append(f"{current_path}: {obj.strip()}")
 
 
 
70
 
71
  extract_text(data)
72
- return documents
 
 
 
 
 
 
 
73
  except Exception as e:
74
  raise Exception(f"Lỗi đọc JSON: {str(e)}")
 
2
  import json
3
  import pandas as pd
4
  from typing import List
5
+ import chardet # THÊM để detect encoding
6
+ import traceback
7
 
8
class WikipediaProcessor:
    """Parses uploaded dump files (.txt/.csv/.json) into lists of plain-text documents."""

    def __init__(self):
        # File extensions accepted by process_uploaded_file().
        self.supported_formats = ['.txt', '.csv', '.json']
11
 
12
  def process_uploaded_file(self, file_path: str) -> List[str]:
13
+ """Xử lý file uploaded với debug chi tiết"""
14
+ print(f"🔄 Bắt đầu xử lý file: {file_path}")
15
+
16
+ if not file_path or not os.path.exists(file_path):
17
+ raise Exception(f"File không tồn tại: {file_path}")
18
+
19
  file_ext = os.path.splitext(file_path)[1].lower()
20
+ print(f"📁 File extension: {file_ext}")
21
 
22
  try:
23
  if file_ext == '.txt':
24
+ documents = self._process_txt_file(file_path)
25
  elif file_ext == '.csv':
26
+ documents = self._process_csv_file(file_path)
27
  elif file_ext == '.json':
28
+ documents = self._process_json_file(file_path)
29
  else:
30
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
31
+
32
+ print(f"✅ Đã xử lý được {len(documents)} documents từ file")
33
+ return documents
34
+
35
  except Exception as e:
36
+ error_msg = f"Lỗi xử lý file {file_path}: {str(e)}"
37
+ print(f"❌ {error_msg}")
38
+ print(f"DEBUG: {traceback.format_exc()}")
39
+ raise Exception(error_msg)
40
 
41
  def _process_txt_file(self, file_path: str) -> List[str]:
42
+ """Xử lý file text với encoding detection"""
43
+ try:
44
+ # Detect encoding
45
+ with open(file_path, 'rb') as f:
46
+ raw_data = f.read(10000) # Read first 10KB to detect encoding
47
+ encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
48
+
49
+ print(f"📝 Detected encoding: {encoding}")
50
+
51
+ # Read with detected encoding
52
+ with open(file_path, 'r', encoding=encoding, errors='replace') as f:
53
+ content = f.read()
54
+
55
+ if not content.strip():
56
+ raise Exception("File trống hoặc không có nội dung")
57
+
58
+ # Multiple ways to split content
59
+ paragraphs = []
60
+
61
+ # Method 1: Split by double newlines
62
+ paragraphs1 = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
63
+
64
+ # Method 2: Split by single newlines (for line-by-line files)
65
+ paragraphs2 = [p.strip() for p in content.split('\n') if p.strip() and len(p.strip()) > 10]
66
+
67
+ # Method 3: Split by sentences (for long continuous text)
68
+ import re
69
+ sentences = re.split(r'[.!?]+', content)
70
+ paragraphs3 = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
71
+
72
+ # Combine all methods, remove duplicates
73
+ all_paragraphs = paragraphs1 + paragraphs2 + paragraphs3
74
+ unique_paragraphs = []
75
+ seen = set()
76
+
77
+ for p in all_paragraphs:
78
+ if p not in seen and len(p) > 10: # At least 10 characters
79
+ seen.add(p)
80
+ unique_paragraphs.append(p)
81
+
82
+ print(f"📊 Text processing: {len(paragraphs1)} paragraphs, {len(paragraphs2)} lines, {len(paragraphs3)} sentences → {len(unique_paragraphs)} unique documents")
83
+
84
+ if not unique_paragraphs:
85
+ raise Exception("Không tìm thấy đoạn văn bản hợp lệ trong file")
86
+
87
+ return unique_paragraphs[:100] # Limit to 100 documents to avoid overload
88
+
89
+ except Exception as e:
90
+ raise Exception(f"Lỗi đọc file text: {str(e)}")
91
 
92
  def _process_csv_file(self, file_path: str) -> List[str]:
93
+ """Xử lý file CSV với error handling"""
94
  try:
95
+ # Try different encodings
96
+ encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-8-sig']
97
+
98
+ df = None
99
+ used_encoding = None
100
+
101
+ for encoding in encodings:
102
+ try:
103
+ df = pd.read_csv(file_path, encoding=encoding)
104
+ used_encoding = encoding
105
+ print(f"✅ Đọc CSV thành công với encoding: {encoding}")
106
+ break
107
+ except (UnicodeDecodeError, pd.errors.EmptyDataError) as e:
108
+ print(f"❌ Thất bại với encoding {encoding}: {e}")
109
+ continue
110
+
111
+ if df is None:
112
+ raise Exception("Không thể đọc file CSV với các encoding thông dụng")
113
+
114
+ if df.empty:
115
+ raise Exception("File CSV trống")
116
+
117
+ print(f"📊 CSV shape: {df.shape}")
118
+ print(f"📊 CSV columns: {list(df.columns)}")
119
+
120
  documents = []
121
 
122
+ # Process each row
123
+ for idx, row in df.iterrows():
124
  doc_parts = []
125
  for col in df.columns:
126
  if pd.notna(row[col]) and str(row[col]).strip():
127
+ value = str(row[col]).strip()
128
+ if len(value) > 3: # Include shorter values for CSV
129
+ doc_parts.append(f"{col}: {value}")
130
+
131
  if doc_parts:
132
+ document = " | ".join(doc_parts)
133
+ documents.append(document)
134
+
135
+ print(f"✅ Đã tạo {len(documents)} documents từ {len(df)} rows")
136
 
137
+ if not documents:
138
+ raise Exception("Không tạo được documents từ dữ liệu CSV")
139
+
140
  return documents
141
+
142
  except Exception as e:
143
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
144
 
145
  def _process_json_file(self, file_path: str) -> List[str]:
146
+ """Xử lý file JSON với debug"""
147
  try:
148
  with open(file_path, 'r', encoding='utf-8') as f:
149
  data = json.load(f)
150
 
151
+ print(f"📊 JSON data type: {type(data)}")
152
+
153
  documents = []
154
 
155
+ def extract_text(obj, current_path="", depth=0):
156
+ if depth > 10: # Prevent infinite recursion
157
+ return
158
+
159
  if isinstance(obj, dict):
160
  for key, value in obj.items():
161
+ new_path = f"{current_path}.{key}" if current_path else key
162
+ extract_text(value, new_path, depth + 1)
163
  elif isinstance(obj, list):
164
+ for i, item in enumerate(obj):
165
+ extract_text(item, f"{current_path}[{i}]", depth + 1)
166
+ elif isinstance(obj, str) and len(obj.strip()) > 5:
167
+ clean_text = obj.strip()
168
+ documents.append(f"{current_path}: {clean_text}")
169
+ elif isinstance(obj, (int, float, bool)):
170
+ documents.append(f"{current_path}: {obj}")
171
 
172
  extract_text(data)
173
+
174
+ print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
175
+
176
+ if not documents:
177
+ raise Exception("Không trích xuất được nội dung từ JSON")
178
+
179
+ return documents[:100] # Limit to avoid overload
180
+
181
  except Exception as e:
182
  raise Exception(f"Lỗi đọc JSON: {str(e)}")