wayne0603 committed on
Commit
0f0afda
·
verified ·
1 Parent(s): 1beff81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -9
app.py CHANGED
@@ -37,14 +37,24 @@ def load_file(file_obj):
37
  docs = []
38
  text_data = ""
39
 
40
- if file_obj.name.endswith(".pdf"):
41
- reader = PdfReader(file_obj.name)
42
- for page in reader.pages:
43
- text_data += page.extract_text() + "\n"
44
- elif file_obj.name.endswith(".txt"):
45
- text_data = file_obj.read().decode("utf-8")
46
- else:
47
- return "仅支持 PDF 或 TXT 文件", None
 
 
 
 
 
 
 
 
 
 
48
 
49
  # 切块
50
  chunks = [text_data[i:i+500] for i in range(0, len(text_data), 500)]
@@ -56,7 +66,6 @@ def load_file(file_obj):
56
  index.add(doc_embeddings)
57
 
58
  return f"已加载 {len(docs)} 个文本块", None
59
-
60
  # ===== RAG 查询函数 =====
61
  def rag_query(query):
62
  if index is None:
 
37
  docs = []
38
  text_data = ""
39
 
40
+ ext = os.path.splitext(file_obj.name)[1].lower()
41
+
42
+ try:
43
+ if ext == ".pdf":
44
+ reader = PdfReader(file_obj.name)
45
+ for page in reader.pages:
46
+ page_text = page.extract_text()
47
+ if page_text:
48
+ text_data += page_text + "\n"
49
+ elif ext == ".txt":
50
+ text_data = file_obj.read().decode("utf-8", errors="ignore")
51
+ else:
52
+ return "仅支持 PDF 或 TXT 文件", None
53
+ except Exception as e:
54
+ return f"文件解析失败: {str(e)}", None
55
+
56
+ if not text_data.strip():
57
+ return "未能从文件中提取到文本", None
58
 
59
  # 切块
60
  chunks = [text_data[i:i+500] for i in range(0, len(text_data), 500)]
 
66
  index.add(doc_embeddings)
67
 
68
  return f"已加载 {len(docs)} 个文本块", None
 
69
  # ===== RAG 查询函数 =====
70
  def rag_query(query):
71
  if index is None: