ShynBui committed on
Commit
90e90fa
·
verified ·
1 Parent(s): 96cab2c

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +13 -5
utils.py CHANGED
@@ -11,11 +11,12 @@ def split_with_source(text, source):
11
  splitter = CharacterTextSplitter(
12
  separator = "\n",
13
  chunk_size = 256,
14
- chunk_overlap = 72,
15
  length_function = len,
16
  add_start_index = True,
17
  )
18
  documents = splitter.create_documents([text])
 
19
  for doc in documents:
20
  doc.metadata["source"] = source
21
  # print(doc.metadata)
@@ -44,6 +45,7 @@ def get_document_from_raw_text():
44
  for i in files:
45
  file_path = i
46
  with open(os.path.join(os.path.join(os.getcwd(), "raw_data"),file_path), 'r', encoding="utf-8") as file:
 
47
  # Tiền xử lý văn bản
48
  content = file.read().replace('\n\n', "\n")
49
  # content = ''.join(content.split('.'))
@@ -51,22 +53,28 @@ def get_document_from_raw_text():
51
  texts = split_with_source(new_doc, i)
52
  documents = documents + texts
53
 
 
 
 
 
 
 
54
  return documents
55
 
56
  def load_the_embedding_retrieve(is_ready = False, k = 3, model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
 
57
  if is_ready:
58
- embeddings = HuggingFaceEmbeddings(model_name=model)
59
  retriever = Chroma(persist_directory=os.path.join(os.getcwd(), "Data"), embedding_function=embeddings).as_retriever(
60
  search_kwargs={"k": k}
61
  )
62
  else:
63
-
64
  documents = get_document_from_raw_text()
65
-
66
- retriever = Chroma.from_documents(documents, embedding=model).as_retriever(
67
  search_kwargs={"k": k}
68
  )
69
 
 
70
  return retriever
71
 
72
  def load_the_bm25_retrieve(k = 3):
 
11
  splitter = CharacterTextSplitter(
12
  separator = "\n",
13
  chunk_size = 256,
14
+ chunk_overlap = 0,
15
  length_function = len,
16
  add_start_index = True,
17
  )
18
  documents = splitter.create_documents([text])
19
+ print(documents)
20
  for doc in documents:
21
  doc.metadata["source"] = source
22
  # print(doc.metadata)
 
45
  for i in files:
46
  file_path = i
47
  with open(os.path.join(os.path.join(os.getcwd(), "raw_data"),file_path), 'r', encoding="utf-8") as file:
48
+ # Xử lý bằng text_spliter
49
  # Tiền xử lý văn bản
50
  content = file.read().replace('\n\n', "\n")
51
  # content = ''.join(content.split('.'))
 
53
  texts = split_with_source(new_doc, i)
54
  documents = documents + texts
55
 
56
+ ##Xử lý mỗi khi xuống dòng
57
+ # for line in file:
58
+ # # Loại bỏ khoảng trắng thừa và ký tự xuống dòng ở đầu và cuối mỗi dòng
59
+ # line = line.strip()
60
+ # documents.append(Document(page_content=line, metadata={"source": i}))
61
+ print(documents)
62
  return documents
63
 
64
  def load_the_embedding_retrieve(is_ready = False, k = 3, model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
65
+ embeddings = HuggingFaceEmbeddings(model_name=model)
66
  if is_ready:
 
67
  retriever = Chroma(persist_directory=os.path.join(os.getcwd(), "Data"), embedding_function=embeddings).as_retriever(
68
  search_kwargs={"k": k}
69
  )
70
  else:
 
71
  documents = get_document_from_raw_text()
72
+ print(type(documents))
73
+ retriever = Chroma.from_documents(documents, embeddings).as_retriever(
74
  search_kwargs={"k": k}
75
  )
76
 
77
+
78
  return retriever
79
 
80
  def load_the_bm25_retrieve(k = 3):