desolo-2918 committed on
Commit
702e91c
·
verified ·
1 Parent(s): 13cf9be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -62,7 +62,10 @@ def initialize_policy_db():
62
  reader = PyPDF2.PdfReader(f)
63
  policy_text = "".join([page.extract_text() for page in reader.pages])
64
 
65
- policy_chunks = policy_text.split("\n\n")
 
 
 
66
  ids = [f"chunk_{i}" for i in range(len(policy_chunks))]
67
  embeddings = embedder.encode(policy_chunks).tolist()
68
  collection.add(documents=policy_chunks, embeddings=embeddings, ids=ids)
 
62
  reader = PyPDF2.PdfReader(f)
63
  policy_text = "".join([page.extract_text() for page in reader.pages])
64
 
65
+ # There was an issue where the entire policy PDF was being passed in, potentially because the "\n\n" separators were not scraped correctly, so chunking was switched to a fixed number of characters.
66
+ chunk_size = 1000
67
+ policy_chunks = [policy_text[i:i + chunk_size] for i in range(0, len(policy_text), chunk_size)]
68
+
69
  ids = [f"chunk_{i}" for i in range(len(policy_chunks))]
70
  embeddings = embedder.encode(policy_chunks).tolist()
71
  collection.add(documents=policy_chunks, embeddings=embeddings, ids=ids)