Update core/wikipedia_processor.py
Browse files- core/wikipedia_processor.py +52 -15
core/wikipedia_processor.py
CHANGED
|
@@ -1,9 +1,3 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import json
|
| 3 |
-
import pandas as pd
|
| 4 |
-
from typing import List
|
| 5 |
-
from models.schemas import RAGDocument
|
| 6 |
-
|
| 7 |
class WikipediaProcessor:
|
| 8 |
def __init__(self):
|
| 9 |
self.supported_formats = ['.txt', '.csv', '.json']
|
|
@@ -13,6 +7,8 @@ class WikipediaProcessor:
|
|
| 13 |
file_ext = os.path.splitext(file_path)[1].lower()
|
| 14 |
|
| 15 |
try:
|
|
|
|
|
|
|
| 16 |
if file_ext == '.txt':
|
| 17 |
return self._process_txt_file(file_path)
|
| 18 |
elif file_ext == '.csv':
|
|
@@ -22,36 +18,73 @@ class WikipediaProcessor:
|
|
| 22 |
else:
|
| 23 |
raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
|
| 24 |
except Exception as e:
|
|
|
|
| 25 |
raise Exception(f"Lỗi xử lý file: {str(e)}")
|
| 26 |
|
| 27 |
def _process_txt_file(self, file_path: str) -> List[str]:
|
| 28 |
"""Xử lý file text"""
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def _process_csv_file(self, file_path: str) -> List[str]:
|
| 36 |
"""Xử lý file CSV"""
|
|
|
|
| 37 |
try:
|
| 38 |
df = pd.read_csv(file_path)
|
| 39 |
documents = []
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
doc_parts = []
|
| 43 |
for col in df.columns:
|
| 44 |
if pd.notna(row[col]) and str(row[col]).strip():
|
| 45 |
doc_parts.append(f"{col}: {row[col]}")
|
|
|
|
| 46 |
if doc_parts:
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
|
|
|
| 49 |
return documents
|
| 50 |
except Exception as e:
|
|
|
|
| 51 |
raise Exception(f"Lỗi đọc CSV: {str(e)}")
|
| 52 |
|
| 53 |
def _process_json_file(self, file_path: str) -> List[str]:
|
| 54 |
"""Xử lý file JSON"""
|
|
|
|
| 55 |
try:
|
| 56 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 57 |
data = json.load(f)
|
|
@@ -63,12 +96,16 @@ class WikipediaProcessor:
|
|
| 63 |
for key, value in obj.items():
|
| 64 |
extract_text(value, f"{current_path}.{key}" if current_path else key)
|
| 65 |
elif isinstance(obj, list):
|
| 66 |
-
for item in obj:
|
| 67 |
-
extract_text(item, current_path)
|
| 68 |
elif isinstance(obj, str) and len(obj.strip()) > 10:
|
| 69 |
documents.append(f"{current_path}: {obj.strip()}")
|
|
|
|
|
|
|
| 70 |
|
| 71 |
extract_text(data)
|
|
|
|
| 72 |
return documents
|
| 73 |
except Exception as e:
|
|
|
|
| 74 |
raise Exception(f"Lỗi đọc JSON: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
class WikipediaProcessor:
|
| 2 |
def __init__(self):
|
| 3 |
self.supported_formats = ['.txt', '.csv', '.json']
|
|
|
|
| 7 |
file_ext = os.path.splitext(file_path)[1].lower()
|
| 8 |
|
| 9 |
try:
|
| 10 |
+
print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")
|
| 11 |
+
|
| 12 |
if file_ext == '.txt':
|
| 13 |
return self._process_txt_file(file_path)
|
| 14 |
elif file_ext == '.csv':
|
|
|
|
| 18 |
else:
|
| 19 |
raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
|
| 20 |
except Exception as e:
|
| 21 |
+
print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
|
| 22 |
raise Exception(f"Lỗi xử lý file: {str(e)}")
|
| 23 |
|
| 24 |
def _process_txt_file(self, file_path: str) -> List[str]:
|
| 25 |
"""Xử lý file text"""
|
| 26 |
+
print(f"📖 Đọc file text: {file_path}")
|
| 27 |
+
try:
|
| 28 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 29 |
+
content = f.read()
|
| 30 |
+
|
| 31 |
+
# Multiple splitting strategies
|
| 32 |
+
paragraphs = []
|
| 33 |
+
|
| 34 |
+
# Try splitting by double newlines first
|
| 35 |
+
if '\n\n' in content:
|
| 36 |
+
paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
|
| 37 |
+
else:
|
| 38 |
+
# Try splitting by single newlines
|
| 39 |
+
paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
|
| 40 |
+
|
| 41 |
+
# Filter by length
|
| 42 |
+
paragraphs = [p for p in paragraphs if len(p.strip()) > 10]
|
| 43 |
+
|
| 44 |
+
print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text")
|
| 45 |
+
return paragraphs
|
| 46 |
+
|
| 47 |
+
except UnicodeDecodeError:
|
| 48 |
+
# Try with different encoding
|
| 49 |
+
with open(file_path, 'r', encoding='latin-1') as f:
|
| 50 |
+
content = f.read()
|
| 51 |
+
|
| 52 |
+
paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
|
| 53 |
+
print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text (latin-1)")
|
| 54 |
+
return paragraphs
|
| 55 |
|
| 56 |
def _process_csv_file(self, file_path: str) -> List[str]:
|
| 57 |
"""Xử lý file CSV"""
|
| 58 |
+
print(f"📊 Đọc file CSV: {file_path}")
|
| 59 |
try:
|
| 60 |
df = pd.read_csv(file_path)
|
| 61 |
documents = []
|
| 62 |
|
| 63 |
+
print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")
|
| 64 |
+
|
| 65 |
+
for idx, row in df.iterrows():
|
| 66 |
doc_parts = []
|
| 67 |
for col in df.columns:
|
| 68 |
if pd.notna(row[col]) and str(row[col]).strip():
|
| 69 |
doc_parts.append(f"{col}: {row[col]}")
|
| 70 |
+
|
| 71 |
if doc_parts:
|
| 72 |
+
full_doc = " | ".join(doc_parts)
|
| 73 |
+
if len(full_doc) > 10: # Ensure minimum length
|
| 74 |
+
documents.append(full_doc)
|
| 75 |
+
|
| 76 |
+
if idx < 3: # Log first few rows
|
| 77 |
+
print(f"📝 Hàng {idx}: {doc_parts}")
|
| 78 |
|
| 79 |
+
print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
|
| 80 |
return documents
|
| 81 |
except Exception as e:
|
| 82 |
+
print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
|
| 83 |
raise Exception(f"Lỗi đọc CSV: {str(e)}")
|
| 84 |
|
| 85 |
def _process_json_file(self, file_path: str) -> List[str]:
|
| 86 |
"""Xử lý file JSON"""
|
| 87 |
+
print(f"📄 Đọc file JSON: {file_path}")
|
| 88 |
try:
|
| 89 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 90 |
data = json.load(f)
|
|
|
|
| 96 |
for key, value in obj.items():
|
| 97 |
extract_text(value, f"{current_path}.{key}" if current_path else key)
|
| 98 |
elif isinstance(obj, list):
|
| 99 |
+
for i, item in enumerate(obj):
|
| 100 |
+
extract_text(item, f"{current_path}[{i}]")
|
| 101 |
elif isinstance(obj, str) and len(obj.strip()) > 10:
|
| 102 |
documents.append(f"{current_path}: {obj.strip()}")
|
| 103 |
+
elif isinstance(obj, (int, float, bool)):
|
| 104 |
+
documents.append(f"{current_path}: {obj}")
|
| 105 |
|
| 106 |
extract_text(data)
|
| 107 |
+
print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
|
| 108 |
return documents
|
| 109 |
except Exception as e:
|
| 110 |
+
print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
|
| 111 |
raise Exception(f"Lỗi đọc JSON: {str(e)}")
|