datbkpro committed on
Commit
5d11746
·
verified ·
1 Parent(s): a0335d2

Update core/wikipedia_processor.py

Browse files
Files changed (1) hide show
  1. core/wikipedia_processor.py +25 -133
core/wikipedia_processor.py CHANGED
@@ -2,181 +2,73 @@ import os
2
  import json
3
  import pandas as pd
4
  from typing import List
5
- import chardet # THÊM để detect encoding
6
- import traceback
7
 
8
  class WikipediaProcessor:
9
    def __init__(self):
        # File extensions this processor knows how to parse.
        self.supported_formats = ['.txt', '.csv', '.json']
11
 
12
    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Process an uploaded file and return the extracted documents.

        Dispatches on the file extension (.txt, .csv, .json) and prints
        step-by-step debug output along the way.

        Args:
            file_path: Path to the uploaded file on disk.

        Returns:
            List of text documents extracted from the file.

        Raises:
            Exception: if the path is empty/missing, the extension is
                unsupported, or the format-specific parser fails.
        """
        print(f"🔄 Bắt đầu xử lý file: {file_path}")

        # Fail fast on an empty or non-existent path.
        if not file_path or not os.path.exists(file_path):
            raise Exception(f"File không tồn tại: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()
        print(f"📁 File extension: {file_ext}")

        try:
            if file_ext == '.txt':
                documents = self._process_txt_file(file_path)
            elif file_ext == '.csv':
                documents = self._process_csv_file(file_path)
            elif file_ext == '.json':
                documents = self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")

            print(f"✅ Đã xử lý được {len(documents)} documents từ file")
            return documents

        except Exception as e:
            # Wraps every failure (including the ValueError above) in one
            # Exception type; the traceback is printed for debugging but
            # NOTE(review): not chained via `raise ... from e`.
            error_msg = f"Lỗi xử lý file {file_path}: {str(e)}"
            print(f"❌ {error_msg}")
            print(f"DEBUG: {traceback.format_exc()}")
            raise Exception(error_msg)
41
    def _process_txt_file(self, file_path: str) -> List[str]:
        """Process a text file with automatic encoding detection.

        Splits the content three ways (blank-line paragraphs, single
        lines, sentence fragments), de-duplicates the pieces in
        first-seen order, and caps the result at 100 documents.
        """
        try:
            # Detect the encoding from a sample of the raw bytes.
            with open(file_path, 'rb') as f:
                raw_data = f.read(10000)  # first 10KB is enough for detection
                encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            print(f"📝 Detected encoding: {encoding}")

            # Re-read as text; undecodable bytes are replaced, not raised.
            with open(file_path, 'r', encoding=encoding, errors='replace') as f:
                content = f.read()

            if not content.strip():
                raise Exception("File trống hoặc không có nội dung")

            # Multiple ways to split content.
            paragraphs = []

            # Method 1: split by double newlines (paragraph-style files).
            paragraphs1 = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]

            # Method 2: split by single newlines (line-by-line files).
            paragraphs2 = [p.strip() for p in content.split('\n') if p.strip() and len(p.strip()) > 10]

            # Method 3: split by sentences (long continuous text).
            import re
            sentences = re.split(r'[.!?]+', content)
            paragraphs3 = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

            # Combine all methods and remove duplicates, keeping order.
            all_paragraphs = paragraphs1 + paragraphs2 + paragraphs3
            unique_paragraphs = []
            seen = set()

            for p in all_paragraphs:
                if p not in seen and len(p) > 10:  # at least 10 characters
                    seen.add(p)
                    unique_paragraphs.append(p)

            print(f"📊 Text processing: {len(paragraphs1)} paragraphs, {len(paragraphs2)} lines, {len(paragraphs3)} sentences → {len(unique_paragraphs)} unique documents")

            if not unique_paragraphs:
                raise Exception("Không tìm thấy đoạn văn bản hợp lệ trong file")

            return unique_paragraphs[:100]  # limit to 100 documents to avoid overload

        except Exception as e:
            raise Exception(f"Lỗi đọc file text: {str(e)}")
92
    def _process_csv_file(self, file_path: str) -> List[str]:
        """Process a CSV file, trying several encodings before giving up.

        Each row becomes one document of the form
        "col1: value | col2: value | ...", skipping blank/short cells.
        """
        try:
            # Try common encodings in order until one parses.
            encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-8-sig']

            df = None
            used_encoding = None

            for encoding in encodings:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    used_encoding = encoding
                    print(f"✅ Đọc CSV thành công với encoding: {encoding}")
                    break
                except (UnicodeDecodeError, pd.errors.EmptyDataError) as e:
                    print(f"❌ Thất bại với encoding {encoding}: {e}")
                    continue

            if df is None:
                raise Exception("Không thể đọc file CSV với các encoding thông dụng")

            if df.empty:
                raise Exception("File CSV trống")

            print(f"📊 CSV shape: {df.shape}")
            print(f"📊 CSV columns: {list(df.columns)}")

            documents = []

            # Build one document per row from its non-empty cells.
            for idx, row in df.iterrows():
                doc_parts = []
                for col in df.columns:
                    if pd.notna(row[col]) and str(row[col]).strip():
                        value = str(row[col]).strip()
                        if len(value) > 3:  # include shorter values for CSV
                            doc_parts.append(f"{col}: {value}")

                if doc_parts:
                    document = " | ".join(doc_parts)
                    documents.append(document)

            print(f"✅ Đã tạo {len(documents)} documents từ {len(df)} rows")

            if not documents:
                raise Exception("Không tạo được documents từ dữ liệu CSV")

            return documents

        except Exception as e:
            raise Exception(f"Lỗi đọc CSV: {str(e)}")
 
145
    def _process_json_file(self, file_path: str) -> List[str]:
        """Process a JSON file by flattening it into 'path: value' documents."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            print(f"📊 JSON data type: {type(data)}")

            documents = []

            def extract_text(obj, current_path="", depth=0):
                # Stop descending past 10 levels to prevent runaway recursion.
                if depth > 10:
                    return

                if isinstance(obj, dict):
                    for key, value in obj.items():
                        new_path = f"{current_path}.{key}" if current_path else key
                        extract_text(value, new_path, depth + 1)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]", depth + 1)
                elif isinstance(obj, str) and len(obj.strip()) > 5:
                    clean_text = obj.strip()
                    documents.append(f"{current_path}: {clean_text}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)

            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")

            if not documents:
                raise Exception("Không trích xuất được nội dung từ JSON")

            return documents[:100]  # limit to avoid overload

        except Exception as e:
            raise Exception(f"Lỗi đọc JSON: {str(e)}")
 
2
  import json
3
  import pandas as pd
4
  from typing import List
5
+ from models.schemas import RAGDocument
 
6
 
7
  class WikipediaProcessor:
8
    def __init__(self):
        # File extensions process_uploaded_file knows how to dispatch on.
        self.supported_formats = ['.txt', '.csv', '.json']
10
 
11
  def process_uploaded_file(self, file_path: str) -> List[str]:
12
+ """Xử lý file Wikipedia uploaded"""
 
 
 
 
 
13
  file_ext = os.path.splitext(file_path)[1].lower()
 
14
 
15
  try:
16
  if file_ext == '.txt':
17
+ return self._process_txt_file(file_path)
18
  elif file_ext == '.csv':
19
+ return self._process_csv_file(file_path)
20
  elif file_ext == '.json':
21
+ return self._process_json_file(file_path)
22
  else:
23
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
 
 
 
 
24
  except Exception as e:
25
+ raise Exception(f"Lỗi xử lý file: {str(e)}")
 
 
 
26
 
27
  def _process_txt_file(self, file_path: str) -> List[str]:
28
+ """Xử lý file text"""
29
+ with open(file_path, 'r', encoding='utf-8') as f:
30
+ content = f.read()
31
+
32
+ paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
33
+ return paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def _process_csv_file(self, file_path: str) -> List[str]:
36
+ """Xử lý file CSV"""
37
  try:
38
+ df = pd.read_csv(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  documents = []
40
 
41
+ for _, row in df.iterrows():
 
42
  doc_parts = []
43
  for col in df.columns:
44
  if pd.notna(row[col]) and str(row[col]).strip():
45
+ doc_parts.append(f"{col}: {row[col]}")
 
 
 
46
  if doc_parts:
47
+ documents.append(" | ".join(doc_parts))
 
 
 
48
 
 
 
 
49
  return documents
 
50
  except Exception as e:
51
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
52
 
53
  def _process_json_file(self, file_path: str) -> List[str]:
54
+ """Xử lý file JSON"""
55
  try:
56
  with open(file_path, 'r', encoding='utf-8') as f:
57
  data = json.load(f)
58
 
 
 
59
  documents = []
60
 
61
+ def extract_text(obj, current_path=""):
 
 
 
62
  if isinstance(obj, dict):
63
  for key, value in obj.items():
64
+ extract_text(value, f"{current_path}.{key}" if current_path else key)
 
65
  elif isinstance(obj, list):
66
+ for item in obj:
67
+ extract_text(item, current_path)
68
+ elif isinstance(obj, str) and len(obj.strip()) > 10:
69
+ documents.append(f"{current_path}: {obj.strip()}")
 
 
 
70
 
71
  extract_text(data)
72
+ return documents
 
 
 
 
 
 
 
73
  except Exception as e:
74
  raise Exception(f"Lỗi đọc JSON: {str(e)}")