ss900371tw committed on
Commit
1bd9676
·
verified ·
1 Parent(s): 3b0fca1

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +8 -12
src/streamlit_app.py CHANGED
@@ -115,8 +115,8 @@ if "messages" not in st.session_state:
115
  st.session_state.messages = []
116
 
117
  # === 修改:檔案處理邏輯 (轉為 FAISS) ===
118
- def process_file_to_faiss(uploaded_file, chunk_size, chunk_overlap=50):
119
- """讀取檔案 -> 切塊 -> 建立 FAISS 索引"""
120
  text_content = ""
121
  try:
122
  # 1. 讀取文字
@@ -134,17 +134,13 @@ def process_file_to_faiss(uploaded_file, chunk_size, chunk_overlap=50):
134
  if not text_content.strip():
135
  return None, "File is empty"
136
 
137
- # 2. 切分文本 (Chunking)
138
- text_splitter = RecursiveCharacterTextSplitter(
139
- chunk_size=chunk_size,
140
- chunk_overlap=chunk_overlap,
141
- separators=["\n\n", "\n", " ", ""]
142
- )
143
- # 將純文字包裝成 LangChain Document 物件
144
- docs = [Document(page_content=x) for x in text_splitter.split_text(text_content)]
145
 
146
- # 3. 建立/更新 FAISS
147
- # 如果 session 中已有 vector_store,可以選擇合併,這裡示範每次上傳新檔就重建一個新的
148
  vector_store = FAISS.from_documents(docs, embedding_model)
149
 
150
  return vector_store, f"Success: {len(docs)} chunks created."
 
115
  st.session_state.messages = []
116
 
117
  # === 修改:檔案處理邏輯 (轉為 FAISS) ===
118
+ def process_file_to_faiss(uploaded_file, chunk_overlap=0):
119
+ """讀取檔案 -> </Event> 分割 -> 建立 FAISS 索引"""
120
  text_content = ""
121
  try:
122
  # 1. 讀取文字
 
134
  if not text_content.strip():
135
  return None, "File is empty"
136
 
137
+ # 2. </Event> 做分割
138
+ events = [e + "</Event>" for e in text_content.split("</Event>") if e.strip()]
139
+
140
+ # 3. 封裝成 Document
141
+ docs = [Document(page_content=e) for e in events]
 
 
 
142
 
143
+ # 4. 建立 FAISS
 
144
  vector_store = FAISS.from_documents(docs, embedding_model)
145
 
146
  return vector_store, f"Success: {len(docs)} chunks created."