Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 21, 2025

Commit

5d11746

verified ·

1 Parent(s): a0335d2

Update core/wikipedia_processor.py

Browse files

Files changed (1) hide show

core/wikipedia_processor.py +25 -133

core/wikipedia_processor.py CHANGED Viewed

@@ -2,181 +2,73 @@ import os
 import json
 import pandas as pd
 from typing import List
-import chardet  # THÊM để detect encoding
-import traceback
 class WikipediaProcessor:
     def __init__(self):
         self.supported_formats = ['.txt', '.csv', '.json']
     def process_uploaded_file(self, file_path: str) -> List[str]:
-        """Xử lý file uploaded với debug chi tiết"""
-        print(f"🔄 Bắt đầu xử lý file: {file_path}")
-        if not file_path or not os.path.exists(file_path):
-            raise Exception(f"File không tồn tại: {file_path}")
         file_ext = os.path.splitext(file_path)[1].lower()
-        print(f"📁 File extension: {file_ext}")
         try:
             if file_ext == '.txt':
-                documents = self._process_txt_file(file_path)
             elif file_ext == '.csv':
-                documents = self._process_csv_file(file_path)
             elif file_ext == '.json':
-                documents = self._process_json_file(file_path)
             else:
                 raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
-            print(f"✅ Đã xử lý được {len(documents)} documents từ file")
-            return documents
         except Exception as e:
-            error_msg = f"Lỗi xử lý file {file_path}: {str(e)}"
-            print(f"❌ {error_msg}")
-            print(f"DEBUG: {traceback.format_exc()}")
-            raise Exception(error_msg)
     def _process_txt_file(self, file_path: str) -> List[str]:
-        """Xử lý file text với encoding detection"""
-        try:
-            # Detect encoding
-            with open(file_path, 'rb') as f:
-                raw_data = f.read(10000)  # Read first 10KB to detect encoding
-                encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
-            print(f"📝 Detected encoding: {encoding}")
-            # Read with detected encoding
-            with open(file_path, 'r', encoding=encoding, errors='replace') as f:
-                content = f.read()
-            if not content.strip():
-                raise Exception("File trống hoặc không có nội dung")
-            # Multiple ways to split content
-            paragraphs = []
-            # Method 1: Split by double newlines
-            paragraphs1 = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
-            # Method 2: Split by single newlines (for line-by-line files)
-            paragraphs2 = [p.strip() for p in content.split('\n') if p.strip() and len(p.strip()) > 10]
-            # Method 3: Split by sentences (for long continuous text)
-            import re
-            sentences = re.split(r'[.!?]+', content)
-            paragraphs3 = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
-            # Combine all methods, remove duplicates
-            all_paragraphs = paragraphs1 + paragraphs2 + paragraphs3
-            unique_paragraphs = []
-            seen = set()
-            for p in all_paragraphs:
-                if p not in seen and len(p) > 10:  # At least 10 characters
-                    seen.add(p)
-                    unique_paragraphs.append(p)
-            print(f"📊 Text processing: {len(paragraphs1)} paragraphs, {len(paragraphs2)} lines, {len(paragraphs3)} sentences → {len(unique_paragraphs)} unique documents")
-            if not unique_paragraphs:
-                raise Exception("Không tìm thấy đoạn văn bản hợp lệ trong file")
-            return unique_paragraphs[:100]  # Limit to 100 documents to avoid overload
-        except Exception as e:
-            raise Exception(f"Lỗi đọc file text: {str(e)}")
     def _process_csv_file(self, file_path: str) -> List[str]:
-        """Xử lý file CSV với error handling"""
         try:
-            # Try different encodings
-            encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-8-sig']
-            df = None
-            used_encoding = None
-            for encoding in encodings:
-                try:
-                    df = pd.read_csv(file_path, encoding=encoding)
-                    used_encoding = encoding
-                    print(f"✅ Đọc CSV thành công với encoding: {encoding}")
-                    break
-                except (UnicodeDecodeError, pd.errors.EmptyDataError) as e:
-                    print(f"❌ Thất bại với encoding {encoding}: {e}")
-                    continue
-            if df is None:
-                raise Exception("Không thể đọc file CSV với các encoding thông dụng")
-            if df.empty:
-                raise Exception("File CSV trống")
-            print(f"📊 CSV shape: {df.shape}")
-            print(f"📊 CSV columns: {list(df.columns)}")
             documents = []
-            # Process each row
-            for idx, row in df.iterrows():
                 doc_parts = []
                 for col in df.columns:
                     if pd.notna(row[col]) and str(row[col]).strip():
-                        value = str(row[col]).strip()
-                        if len(value) > 3:  # Include shorter values for CSV
-                            doc_parts.append(f"{col}: {value}")
                 if doc_parts:
-                    document = " | ".join(doc_parts)
-                    documents.append(document)
-            print(f"✅ Đã tạo {len(documents)} documents từ {len(df)} rows")
-            if not documents:
-                raise Exception("Không tạo được documents từ dữ liệu CSV")
             return documents
         except Exception as e:
             raise Exception(f"Lỗi đọc CSV: {str(e)}")
     def _process_json_file(self, file_path: str) -> List[str]:
-        """Xử lý file JSON với debug"""
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-            print(f"📊 JSON data type: {type(data)}")
             documents = []
-            def extract_text(obj, current_path="", depth=0):
-                if depth > 10:  # Prevent infinite recursion
-                    return
                 if isinstance(obj, dict):
                     for key, value in obj.items():
-                        new_path = f"{current_path}.{key}" if current_path else key
-                        extract_text(value, new_path, depth + 1)
                 elif isinstance(obj, list):
-                    for i, item in enumerate(obj):
-                        extract_text(item, f"{current_path}[{i}]", depth + 1)
-                elif isinstance(obj, str) and len(obj.strip()) > 5:
-                    clean_text = obj.strip()
-                    documents.append(f"{current_path}: {clean_text}")
-                elif isinstance(obj, (int, float, bool)):
-                    documents.append(f"{current_path}: {obj}")
             extract_text(data)
-            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
-            if not documents:
-                raise Exception("Không trích xuất được nội dung từ JSON")
-            return documents[:100]  # Limit to avoid overload
         except Exception as e:
             raise Exception(f"Lỗi đọc JSON: {str(e)}")

 import json
 import pandas as pd
 from typing import List
+from models.schemas import RAGDocument
 class WikipediaProcessor:
     def __init__(self):
         self.supported_formats = ['.txt', '.csv', '.json']
     def process_uploaded_file(self, file_path: str) -> List[str]:
+        """Xử lý file Wikipedia uploaded"""
         file_ext = os.path.splitext(file_path)[1].lower()
         try:
             if file_ext == '.txt':
+                return self._process_txt_file(file_path)
             elif file_ext == '.csv':
+                return self._process_csv_file(file_path)
             elif file_ext == '.json':
+                return self._process_json_file(file_path)
             else:
                 raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
         except Exception as e:
+            raise Exception(f"Lỗi xử lý file: {str(e)}")
     def _process_txt_file(self, file_path: str) -> List[str]:
+        """Xử lý file text"""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
+        return paragraphs
     def _process_csv_file(self, file_path: str) -> List[str]:
+        """Xử lý file CSV"""
         try:
+            df = pd.read_csv(file_path)
             documents = []
+            for _, row in df.iterrows():
                 doc_parts = []
                 for col in df.columns:
                     if pd.notna(row[col]) and str(row[col]).strip():
+                        doc_parts.append(f"{col}: {row[col]}")
                 if doc_parts:
+                    documents.append(" | ".join(doc_parts))
             return documents
         except Exception as e:
             raise Exception(f"Lỗi đọc CSV: {str(e)}")
     def _process_json_file(self, file_path: str) -> List[str]:
+        """Xử lý file JSON"""
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             documents = []
+            def extract_text(obj, current_path=""):
                 if isinstance(obj, dict):
                     for key, value in obj.items():
+                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                 elif isinstance(obj, list):
+                    for item in obj:
+                        extract_text(item, current_path)
+                elif isinstance(obj, str) and len(obj.strip()) > 10:
+                    documents.append(f"{current_path}: {obj.strip()}")
             extract_text(data)
+            return documents
         except Exception as e:
             raise Exception(f"Lỗi đọc JSON: {str(e)}")