datbkpro committed on
Commit
b846ef6
·
verified ·
1 Parent(s): 7c433d8

Update core/wikipedia_processor.py

Browse files
Files changed (1) hide show
  1. core/wikipedia_processor.py +133 -25
core/wikipedia_processor.py CHANGED
@@ -2,73 +2,181 @@ import os
2
  import json
3
  import pandas as pd
4
  from typing import List
5
- from models.schemas import RAGDocument
 
6
 
7
class WikipediaProcessor:
    """Turns uploaded Wikipedia dump files (.txt/.csv/.json) into text documents."""

    def __init__(self):
        # Extensions this processor knows how to parse.
        self.supported_formats = ['.txt', '.csv', '.json']
10
 
11
  def process_uploaded_file(self, file_path: str) -> List[str]:
12
- """Xử lý file Wikipedia uploaded"""
 
 
 
 
 
13
  file_ext = os.path.splitext(file_path)[1].lower()
 
14
 
15
  try:
16
  if file_ext == '.txt':
17
- return self._process_txt_file(file_path)
18
  elif file_ext == '.csv':
19
- return self._process_csv_file(file_path)
20
  elif file_ext == '.json':
21
- return self._process_json_file(file_path)
22
  else:
23
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
 
 
 
 
24
  except Exception as e:
25
- raise Exception(f"Lỗi xử lý file: {str(e)}")
 
 
 
26
 
27
  def _process_txt_file(self, file_path: str) -> List[str]:
28
- """Xử lý file text"""
29
- with open(file_path, 'r', encoding='utf-8') as f:
30
- content = f.read()
31
-
32
- paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
33
- return paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def _process_csv_file(self, file_path: str) -> List[str]:
36
- """Xử lý file CSV"""
37
  try:
38
- df = pd.read_csv(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  documents = []
40
 
41
- for _, row in df.iterrows():
 
42
  doc_parts = []
43
  for col in df.columns:
44
  if pd.notna(row[col]) and str(row[col]).strip():
45
- doc_parts.append(f"{col}: {row[col]}")
 
 
 
46
  if doc_parts:
47
- documents.append(" | ".join(doc_parts))
 
 
 
48
 
 
 
 
49
  return documents
 
50
  except Exception as e:
51
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
52
 
53
  def _process_json_file(self, file_path: str) -> List[str]:
54
- """Xử lý file JSON"""
55
  try:
56
  with open(file_path, 'r', encoding='utf-8') as f:
57
  data = json.load(f)
58
 
 
 
59
  documents = []
60
 
61
- def extract_text(obj, current_path=""):
 
 
 
62
  if isinstance(obj, dict):
63
  for key, value in obj.items():
64
- extract_text(value, f"{current_path}.{key}" if current_path else key)
 
65
  elif isinstance(obj, list):
66
- for item in obj:
67
- extract_text(item, current_path)
68
- elif isinstance(obj, str) and len(obj.strip()) > 10:
69
- documents.append(f"{current_path}: {obj.strip()}")
 
 
 
70
 
71
  extract_text(data)
72
- return documents
 
 
 
 
 
 
 
73
  except Exception as e:
74
  raise Exception(f"Lỗi đọc JSON: {str(e)}")
 
2
  import json
3
  import pandas as pd
4
  from typing import List
5
+ import chardet # THÊM để detect encoding
6
+ import traceback
7
 
8
class WikipediaProcessor:
    """Parses uploaded dump files (.txt/.csv/.json) into lists of plain-text documents."""

    def __init__(self):
        # File extensions accepted by process_uploaded_file().
        self.supported_formats = ['.txt', '.csv', '.json']
11
 
12
  def process_uploaded_file(self, file_path: str) -> List[str]:
13
+ """Xử lý file uploaded với debug chi tiết"""
14
+ print(f"🔄 Bắt đầu xử lý file: {file_path}")
15
+
16
+ if not file_path or not os.path.exists(file_path):
17
+ raise Exception(f"File không tồn tại: {file_path}")
18
+
19
  file_ext = os.path.splitext(file_path)[1].lower()
20
+ print(f"📁 File extension: {file_ext}")
21
 
22
  try:
23
  if file_ext == '.txt':
24
+ documents = self._process_txt_file(file_path)
25
  elif file_ext == '.csv':
26
+ documents = self._process_csv_file(file_path)
27
  elif file_ext == '.json':
28
+ documents = self._process_json_file(file_path)
29
  else:
30
  raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
31
+
32
+ print(f"✅ Đã xử lý được {len(documents)} documents từ file")
33
+ return documents
34
+
35
  except Exception as e:
36
+ error_msg = f"Lỗi xử lý file {file_path}: {str(e)}"
37
+ print(f"❌ {error_msg}")
38
+ print(f"DEBUG: {traceback.format_exc()}")
39
+ raise Exception(error_msg)
40
 
41
  def _process_txt_file(self, file_path: str) -> List[str]:
42
+ """Xử lý file text với encoding detection"""
43
+ try:
44
+ # Detect encoding
45
+ with open(file_path, 'rb') as f:
46
+ raw_data = f.read(10000) # Read first 10KB to detect encoding
47
+ encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
48
+
49
+ print(f"📝 Detected encoding: {encoding}")
50
+
51
+ # Read with detected encoding
52
+ with open(file_path, 'r', encoding=encoding, errors='replace') as f:
53
+ content = f.read()
54
+
55
+ if not content.strip():
56
+ raise Exception("File trống hoặc không có nội dung")
57
+
58
+ # Multiple ways to split content
59
+ paragraphs = []
60
+
61
+ # Method 1: Split by double newlines
62
+ paragraphs1 = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
63
+
64
+ # Method 2: Split by single newlines (for line-by-line files)
65
+ paragraphs2 = [p.strip() for p in content.split('\n') if p.strip() and len(p.strip()) > 10]
66
+
67
+ # Method 3: Split by sentences (for long continuous text)
68
+ import re
69
+ sentences = re.split(r'[.!?]+', content)
70
+ paragraphs3 = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
71
+
72
+ # Combine all methods, remove duplicates
73
+ all_paragraphs = paragraphs1 + paragraphs2 + paragraphs3
74
+ unique_paragraphs = []
75
+ seen = set()
76
+
77
+ for p in all_paragraphs:
78
+ if p not in seen and len(p) > 10: # At least 10 characters
79
+ seen.add(p)
80
+ unique_paragraphs.append(p)
81
+
82
+ print(f"📊 Text processing: {len(paragraphs1)} paragraphs, {len(paragraphs2)} lines, {len(paragraphs3)} sentences → {len(unique_paragraphs)} unique documents")
83
+
84
+ if not unique_paragraphs:
85
+ raise Exception("Không tìm thấy đoạn văn bản hợp lệ trong file")
86
+
87
+ return unique_paragraphs[:100] # Limit to 100 documents to avoid overload
88
+
89
+ except Exception as e:
90
+ raise Exception(f"Lỗi đọc file text: {str(e)}")
91
 
92
  def _process_csv_file(self, file_path: str) -> List[str]:
93
+ """Xử lý file CSV với error handling"""
94
  try:
95
+ # Try different encodings
96
+ encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-8-sig']
97
+
98
+ df = None
99
+ used_encoding = None
100
+
101
+ for encoding in encodings:
102
+ try:
103
+ df = pd.read_csv(file_path, encoding=encoding)
104
+ used_encoding = encoding
105
+ print(f"✅ Đọc CSV thành công với encoding: {encoding}")
106
+ break
107
+ except (UnicodeDecodeError, pd.errors.EmptyDataError) as e:
108
+ print(f"❌ Thất bại với encoding {encoding}: {e}")
109
+ continue
110
+
111
+ if df is None:
112
+ raise Exception("Không thể đọc file CSV với các encoding thông dụng")
113
+
114
+ if df.empty:
115
+ raise Exception("File CSV trống")
116
+
117
+ print(f"📊 CSV shape: {df.shape}")
118
+ print(f"📊 CSV columns: {list(df.columns)}")
119
+
120
  documents = []
121
 
122
+ # Process each row
123
+ for idx, row in df.iterrows():
124
  doc_parts = []
125
  for col in df.columns:
126
  if pd.notna(row[col]) and str(row[col]).strip():
127
+ value = str(row[col]).strip()
128
+ if len(value) > 3: # Include shorter values for CSV
129
+ doc_parts.append(f"{col}: {value}")
130
+
131
  if doc_parts:
132
+ document = " | ".join(doc_parts)
133
+ documents.append(document)
134
+
135
+ print(f"✅ Đã tạo {len(documents)} documents từ {len(df)} rows")
136
 
137
+ if not documents:
138
+ raise Exception("Không tạo được documents từ dữ liệu CSV")
139
+
140
  return documents
141
+
142
  except Exception as e:
143
  raise Exception(f"Lỗi đọc CSV: {str(e)}")
144
 
145
  def _process_json_file(self, file_path: str) -> List[str]:
146
+ """Xử lý file JSON với debug"""
147
  try:
148
  with open(file_path, 'r', encoding='utf-8') as f:
149
  data = json.load(f)
150
 
151
+ print(f"📊 JSON data type: {type(data)}")
152
+
153
  documents = []
154
 
155
+ def extract_text(obj, current_path="", depth=0):
156
+ if depth > 10: # Prevent infinite recursion
157
+ return
158
+
159
  if isinstance(obj, dict):
160
  for key, value in obj.items():
161
+ new_path = f"{current_path}.{key}" if current_path else key
162
+ extract_text(value, new_path, depth + 1)
163
  elif isinstance(obj, list):
164
+ for i, item in enumerate(obj):
165
+ extract_text(item, f"{current_path}[{i}]", depth + 1)
166
+ elif isinstance(obj, str) and len(obj.strip()) > 5:
167
+ clean_text = obj.strip()
168
+ documents.append(f"{current_path}: {clean_text}")
169
+ elif isinstance(obj, (int, float, bool)):
170
+ documents.append(f"{current_path}: {obj}")
171
 
172
  extract_text(data)
173
+
174
+ print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
175
+
176
+ if not documents:
177
+ raise Exception("Không trích xuất được nội dung từ JSON")
178
+
179
+ return documents[:100] # Limit to avoid overload
180
+
181
  except Exception as e:
182
  raise Exception(f"Lỗi đọc JSON: {str(e)}")