Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed on Oct 22, 2025

Commit

6469f7e

verified ·

1 Parent(s): 8be02b3

Update core/wikipedia_processor.py

Browse files

Files changed (1) hide show

core/wikipedia_processor.py +52 -15

core/wikipedia_processor.py CHANGED Viewed

@@ -1,9 +1,3 @@
-import os
-import json
-import pandas as pd
-from typing import List
-from models.schemas import RAGDocument
 class WikipediaProcessor:
     def __init__(self):
         self.supported_formats = ['.txt', '.csv', '.json']
@@ -13,6 +7,8 @@ class WikipediaProcessor:
         file_ext = os.path.splitext(file_path)[1].lower()
         try:
             if file_ext == '.txt':
                 return self._process_txt_file(file_path)
             elif file_ext == '.csv':
@@ -22,36 +18,73 @@ class WikipediaProcessor:
             else:
                 raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
         except Exception as e:
             raise Exception(f"Lỗi xử lý file: {str(e)}")
     def _process_txt_file(self, file_path: str) -> List[str]:
         """Xử lý file text"""
-        with open(file_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
-        return paragraphs
     def _process_csv_file(self, file_path: str) -> List[str]:
         """Xử lý file CSV"""
         try:
             df = pd.read_csv(file_path)
             documents = []
-            for _, row in df.iterrows():
                 doc_parts = []
                 for col in df.columns:
                     if pd.notna(row[col]) and str(row[col]).strip():
                         doc_parts.append(f"{col}: {row[col]}")
                 if doc_parts:
-                    documents.append(" | ".join(doc_parts))
             return documents
         except Exception as e:
             raise Exception(f"Lỗi đọc CSV: {str(e)}")
     def _process_json_file(self, file_path: str) -> List[str]:
         """Xử lý file JSON"""
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
@@ -63,12 +96,16 @@ class WikipediaProcessor:
                     for key, value in obj.items():
                         extract_text(value, f"{current_path}.{key}" if current_path else key)
                 elif isinstance(obj, list):
-                    for item in obj:
-                        extract_text(item, current_path)
                 elif isinstance(obj, str) and len(obj.strip()) > 10:
                     documents.append(f"{current_path}: {obj.strip()}")
             extract_text(data)
             return documents
         except Exception as e:
             raise Exception(f"Lỗi đọc JSON: {str(e)}")

 class WikipediaProcessor:
     def __init__(self):
         self.supported_formats = ['.txt', '.csv', '.json']
         file_ext = os.path.splitext(file_path)[1].lower()
         try:
+            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")
             if file_ext == '.txt':
                 return self._process_txt_file(file_path)
             elif file_ext == '.csv':
             else:
                 raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
         except Exception as e:
+            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
             raise Exception(f"Lỗi xử lý file: {str(e)}")
     def _process_txt_file(self, file_path: str) -> List[str]:
         """Xử lý file text"""
+        print(f"📖 Đọc file text: {file_path}")
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            # Multiple splitting strategies
+            paragraphs = []
+            # Try splitting by double newlines first
+            if '\n\n' in content:
+                paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
+            else:
+                # Try splitting by single newlines
+                paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
+            # Filter by length
+            paragraphs = [p for p in paragraphs if len(p.strip()) > 10]
+            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text")
+            return paragraphs
+        except UnicodeDecodeError:
+            # Try with different encoding
+            with open(file_path, 'r', encoding='latin-1') as f:
+                content = f.read()
+            paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 10]
+            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text (latin-1)")
+            return paragraphs
     def _process_csv_file(self, file_path: str) -> List[str]:
         """Xử lý file CSV"""
+        print(f"📊 Đọc file CSV: {file_path}")
         try:
             df = pd.read_csv(file_path)
             documents = []
+            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")
+            for idx, row in df.iterrows():
                 doc_parts = []
                 for col in df.columns:
                     if pd.notna(row[col]) and str(row[col]).strip():
                         doc_parts.append(f"{col}: {row[col]}")
                 if doc_parts:
+                    full_doc = " | ".join(doc_parts)
+                    if len(full_doc) > 10:  # Ensure minimum length
+                        documents.append(full_doc)
+                if idx < 3:  # Log first few rows
+                    print(f"📝 Hàng {idx}: {doc_parts}")
+            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
             return documents
         except Exception as e:
+            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
             raise Exception(f"Lỗi đọc CSV: {str(e)}")
     def _process_json_file(self, file_path: str) -> List[str]:
         """Xử lý file JSON"""
+        print(f"📄 Đọc file JSON: {file_path}")
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
                     for key, value in obj.items():
                         extract_text(value, f"{current_path}.{key}" if current_path else key)
                 elif isinstance(obj, list):
+                    for i, item in enumerate(obj):
+                        extract_text(item, f"{current_path}[{i}]")
                 elif isinstance(obj, str) and len(obj.strip()) > 10:
                     documents.append(f"{current_path}: {obj.strip()}")
+                elif isinstance(obj, (int, float, bool)):
+                    documents.append(f"{current_path}: {obj}")
             extract_text(data)
+            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
             return documents
         except Exception as e:
+            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
             raise Exception(f"Lỗi đọc JSON: {str(e)}")