Spaces:
Sleeping
Sleeping
Update utils/ingestion.py
Browse files
- utils/ingestion.py +11 -5
utils/ingestion.py
CHANGED
|
@@ -96,16 +96,22 @@ class DocumentProcessor:
|
|
| 96 |
ids = []
|
| 97 |
|
| 98 |
for idx, chunk in enumerate(processed_chunks):
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
embeddings.append(embedding)
|
| 102 |
metadata_list.append({
|
| 103 |
-
"headings": json.dumps(chunk
|
| 104 |
-
"page": chunk
|
| 105 |
-
"content_type": chunk
|
| 106 |
})
|
| 107 |
ids.append(str(idx))
|
| 108 |
|
|
|
|
| 109 |
collection.add(
|
| 110 |
ids=ids,
|
| 111 |
embeddings=embeddings,
|
|
|
|
| 96 |
ids = []
|
| 97 |
|
| 98 |
for idx, chunk in enumerate(processed_chunks):
|
| 99 |
+
text = chunk.get('text', '').strip()
|
| 100 |
+
if not text:
|
| 101 |
+
print(f"Skipping empty chunk at index {idx}")
|
| 102 |
+
continue # Skip empty chunks
|
| 103 |
+
|
| 104 |
+
embedding = self.embed_model.embed_documents([text])[0] # ✅ Correct method
|
| 105 |
+
documents.append(text)
|
| 106 |
embeddings.append(embedding)
|
| 107 |
metadata_list.append({
|
| 108 |
+
"headings": json.dumps(chunk.get('headings', [])),
|
| 109 |
+
"page": chunk.get('page_info', None),
|
| 110 |
+
"content_type": chunk.get('content_type', None)
|
| 111 |
})
|
| 112 |
ids.append(str(idx))
|
| 113 |
|
| 114 |
+
|
| 115 |
collection.add(
|
| 116 |
ids=ids,
|
| 117 |
embeddings=embeddings,
|