Update app.py
Browse files
app.py
CHANGED
|
@@ -62,7 +62,10 @@ def initialize_policy_db():
|
|
| 62 |
reader = PyPDF2.PdfReader(f)
|
| 63 |
policy_text = "".join([page.extract_text() for page in reader.pages])
|
| 64 |
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
| 66 |
ids = [f"chunk_{i}" for i in range(len(policy_chunks))]
|
| 67 |
embeddings = embedder.encode(policy_chunks).tolist()
|
| 68 |
collection.add(documents=policy_chunks, embeddings=embeddings, ids=ids)
|
|
|
|
| 62 |
reader = PyPDF2.PdfReader(f)
|
| 63 |
policy_text = "".join([page.extract_text() for page in reader.pages])
|
| 64 |
|
| 65 |
+
# Previously the entire policy PDF ended up as a single chunk, possibly because the extracted text was not being split correctly on "\n\n", so the text is now split into fixed-size character chunks instead.
|
| 66 |
+
chunk_size = 1000
|
| 67 |
+
policy_chunks = [policy_text[i:i + chunk_size] for i in range(0, len(policy_text), chunk_size)]
|
| 68 |
+
|
| 69 |
ids = [f"chunk_{i}" for i in range(len(policy_chunks))]
|
| 70 |
embeddings = embedder.encode(policy_chunks).tolist()
|
| 71 |
collection.add(documents=policy_chunks, embeddings=embeddings, ids=ids)
|