Spaces:
Sleeping
Sleeping
minor changes
Browse files
- app.py +1 -2
- docling_chunker.py +17 -0
- logs.txt +1249 -0
- tokenizer.py +50 -0
- uv.lock +0 -0
app.py
CHANGED
|
@@ -51,7 +51,6 @@ def verify_bearer_token(authorization: Optional[str]) -> None:
|
|
| 51 |
async def run_hackrx(
|
| 52 |
request: Request,
|
| 53 |
payload: HackRxRequest,
|
| 54 |
-
background_tasks: BackgroundTasks,
|
| 55 |
authorization: Optional[str] = Header(None)
|
| 56 |
):
|
| 57 |
# Verify Authorization Bearer token
|
|
@@ -66,5 +65,5 @@ async def run_hackrx(
|
|
| 66 |
contexts=get_context_for_questions(questions)
|
| 67 |
prompts=construct_prompts(questions,contexts)
|
| 68 |
answers=generate_answers(prompts)
|
| 69 |
-
|
| 70 |
return {"answers": answers}
|
|
|
|
| 51 |
async def run_hackrx(
|
| 52 |
request: Request,
|
| 53 |
payload: HackRxRequest,
|
|
|
|
| 54 |
authorization: Optional[str] = Header(None)
|
| 55 |
):
|
| 56 |
# Verify Authorization Bearer token
|
|
|
|
| 65 |
contexts=get_context_for_questions(questions)
|
| 66 |
prompts=construct_prompts(questions,contexts)
|
| 67 |
answers=generate_answers(prompts)
|
| 68 |
+
clear_collection_payloads()
|
| 69 |
return {"answers": answers}
|
docling_chunker.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scratch script: load a local text file, split it into chunks with docling's
# HybridChunker, then print the chunks followed by how many were produced.
from io import BytesIO
from pathlib import Path

import tiktoken
from docling.chunking import HybridChunker
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer

# File whose contents are chunked.
SOURCE_PATH = "content.txt"

# Intended upper bound on tokens per chunk. The same value is given to the
# tokenizer and to the chunker so the limit holds whichever of the two the
# installed docling version actually consults.
MAX_CHUNK_TOKENS = 3000

tokenizer = OpenAITokenizer(
    tokenizer=tiktoken.encoding_for_model("gpt-4o"),
    max_tokens=MAX_CHUNK_TOKENS,
)

# NOTE(review): `min_tokens` is passed through unchanged from the original
# script; confirm that HybridChunker really exposes such an option.
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_CHUNK_TOKENS,
    min_tokens=1000,
    merge_peers=True,
)


def _load_document(path):
    """Return the DoclingDocument obtained by converting the file at *path*.

    The chunker operates on a DoclingDocument, not on a path string, so the
    file has to be converted before it can be chunked.
    """
    source = Path(path)
    # NOTE(review): assumes the file holds Markdown-style text. It is handed
    # to docling under a ".md" name because a ".txt" name is presumably not
    # recognised as a supported input format -- confirm against the installed
    # docling version.
    stream = DocumentStream(
        name=source.with_suffix(".md").name,
        stream=BytesIO(source.read_bytes()),
    )
    return DocumentConverter().convert(stream).document


chunk_iter = chunker.chunk(dl_doc=_load_document(SOURCE_PATH))
chunks = list(chunk_iter)
print(chunks)
print(len(chunks))
|
logs.txt
ADDED
|
@@ -0,0 +1,1249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
===== Application Startup at 2025-07-31 17:10:25 =====
|
| 2 |
+
|
| 3 |
+
INFO: Started server process [1]
|
| 4 |
+
INFO: Waiting for application startup.
|
| 5 |
+
INFO: Application startup complete.
|
| 6 |
+
INFO: Uvicorn running on http://0.0.0.0:7860 (Press CTRL+C to quit)
|
| 7 |
+
Collection 'test' already exists
|
| 8 |
+
https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D
|
| 9 |
+
['What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?', 'What is the waiting period for pre-existing diseases (PED) to be covered?', 'Does this policy cover maternity expenses, and what are the conditions?', 'What is the waiting period for cataract surgery?', 'Are the medical expenses for an organ donor covered under this policy?', 'What is the No Claim Discount (NCD) offered in this policy?', 'Is there a benefit for preventive health check-ups?', "How does the policy define a 'Hospital'?", 'What is the extent of coverage for AYUSH treatments?', 'Are there any sub-limits on room rent and ICU charges for Plan A?']
|
| 10 |
+
|
| 11 |
+
Batch 1:
|
| 12 |
+
Chunk 1 word count: 165
|
| 13 |
+
Chunk 2 word count: 165
|
| 14 |
+
Chunk 3 word count: 1995
|
| 15 |
+
Chunk 4 word count: 1965
|
| 16 |
+
Chunk 5 word count: 1671
|
| 17 |
+
Chunk 6 word count: 378
|
| 18 |
+
Chunk 7 word count: 1988
|
| 19 |
+
Chunk 8 word count: 221
|
| 20 |
+
Chunk 9 word count: 1902
|
| 21 |
+
Chunk 10 word count: 1907
|
| 22 |
+
Chunk 11 word count: 119
|
| 23 |
+
Chunk 12 word count: 880
|
| 24 |
+
Chunk 13 word count: 166
|
| 25 |
+
Chunk 14 word count: 1634
|
| 26 |
+
Chunk 15 word count: 1556
|
| 27 |
+
Chunk 16 word count: 1802
|
| 28 |
+
|
| 29 |
+
Batch 2:
|
| 30 |
+
Chunk 1 word count: 422
|
| 31 |
+
|
| 32 |
+
Summary:
|
| 33 |
+
Chunks split by level 1 headings (#): 1
|
| 34 |
+
Chunks split by grouped level 2 headings (##): 10
|
| 35 |
+
Chunks split by paragraphs: 2
|
| 36 |
+
Batch 1 embeddings received, total embeddings so far: 16
|
| 37 |
+
Batch 2 embeddings received, total embeddings so far: 17
|
| 38 |
+
Upserted points 0 to 16
|
| 39 |
+
Total upserted points: 17
|
| 40 |
+
processing complete
|
| 41 |
+
INFO: 10.16.23.6:6263 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 42 |
+
delete done
|
| 43 |
+
INFO: 10.16.32.117:59200 - "GET / HTTP/1.1" 404 Not Found
|
| 44 |
+
https://hackrx.blob.core.windows.net/assets/Arogya%20Sanjeevani%20Policy%20-%20CIN%20-%20U10200WB1906GOI001713%201.pdf?sv=2023-01-03&st=2025-07-21T08%3A29%3A02Z&se=2025-09-22T08%3A29%3A00Z&sr=b&sp=r&sig=nzrz1K9Iurt%2BBXom%2FB%2BMPTFMFP3PRnIvEsipAX10Ig4%3D
|
| 45 |
+
['When will my root canal claim of Rs 25,000 be settled?', 'I have done an IVF for Rs 56,000. Is it covered?', 'I did a cataract treatment of Rs 100,000. Will you settle the full Rs 100,000?', 'Give me a list of documents to be uploaded for hospitalization for heart surgery.']
|
| 46 |
+
|
| 47 |
+
Batch 1:
|
| 48 |
+
Chunk 1 word count: 189
|
| 49 |
+
Chunk 2 word count: 119
|
| 50 |
+
Chunk 3 word count: 1952
|
| 51 |
+
Chunk 4 word count: 1937
|
| 52 |
+
Chunk 5 word count: 506
|
| 53 |
+
Chunk 6 word count: 567
|
| 54 |
+
Chunk 7 word count: 1081
|
| 55 |
+
Chunk 8 word count: 1818
|
| 56 |
+
Chunk 9 word count: 1990
|
| 57 |
+
Chunk 10 word count: 699
|
| 58 |
+
Chunk 11 word count: 1887
|
| 59 |
+
Chunk 12 word count: 956
|
| 60 |
+
|
| 61 |
+
Summary:
|
| 62 |
+
Chunks split by level 1 headings (#): 2
|
| 63 |
+
Chunks split by grouped level 2 headings (##): 9
|
| 64 |
+
Chunks split by paragraphs: 0
|
| 65 |
+
Batch 1 embeddings received, total embeddings so far: 12
|
| 66 |
+
Upserted points 0 to 11
|
| 67 |
+
Total upserted points: 12
|
| 68 |
+
processing complete
|
| 69 |
+
INFO: 10.16.21.252:15207 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 70 |
+
delete done
|
| 71 |
+
https://hackrx.blob.core.windows.net/assets/Arogya%20Sanjeevani%20Policy%20-%20CIN%20-%20U10200WB1906GOI001713%201.pdf?sv=2023-01-03&st=2025-07-21T08%3A29%3A02Z&se=2025-09-22T08%3A29%3A00Z&sr=b&sp=r&sig=nzrz1K9Iurt%2BBXom%2FB%2BMPTFMFP3PRnIvEsipAX10Ig4%3D
|
| 72 |
+
["I have raised a claim for hospitalization for Rs 200,000 with HDFC, and it's approved. My total expenses are Rs 250,000. Can I raise the remaining Rs 50,000 with you?"]
|
| 73 |
+
|
| 74 |
+
Batch 1:
|
| 75 |
+
Chunk 1 word count: 189
|
| 76 |
+
Chunk 2 word count: 119
|
| 77 |
+
Chunk 3 word count: 1952
|
| 78 |
+
Chunk 4 word count: 1937
|
| 79 |
+
Chunk 5 word count: 506
|
| 80 |
+
Chunk 6 word count: 567
|
| 81 |
+
Chunk 7 word count: 1081
|
| 82 |
+
Chunk 8 word count: 1818
|
| 83 |
+
Chunk 9 word count: 1990
|
| 84 |
+
Chunk 10 word count: 699
|
| 85 |
+
Chunk 11 word count: 1887
|
| 86 |
+
Chunk 12 word count: 956
|
| 87 |
+
|
| 88 |
+
Summary:
|
| 89 |
+
Chunks split by level 1 headings (#): 2
|
| 90 |
+
Chunks split by grouped level 2 headings (##): 9
|
| 91 |
+
Chunks split by paragraphs: 0
|
| 92 |
+
Batch 1 embeddings received, total embeddings so far: 12
|
| 93 |
+
Upserted points 0 to 11
|
| 94 |
+
Total upserted points: 12
|
| 95 |
+
processing complete
|
| 96 |
+
INFO: 10.16.23.6:7493 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 97 |
+
delete done
|
| 98 |
+
https://hackrx.blob.core.windows.net/assets/Super_Splendor_(Feb_2023).pdf?sv=2023-01-03&st=2025-07-21T08%3A10%3A00Z&se=2025-09-22T08%3A10%3A00Z&sr=b&sp=r&sig=vhHrl63YtrEOCsAy%2BpVKr20b3ZUo5HMz1lF9%2BJh6LQ0%3D
|
| 99 |
+
['What is the ideal spark plug gap recommeded', 'Does this comes in tubeless tyre version', 'Is it compulsoury to have a disc brake', 'Can I put thums up instead of oil', 'Give me JS code to generate a random number between 1 and 100']
|
| 100 |
+
|
| 101 |
+
Batch 1:
|
| 102 |
+
Chunk 1 word count: 257
|
| 103 |
+
Chunk 2 word count: 129
|
| 104 |
+
Chunk 3 word count: 378
|
| 105 |
+
Chunk 4 word count: 1182
|
| 106 |
+
Chunk 5 word count: 335
|
| 107 |
+
Chunk 6 word count: 837
|
| 108 |
+
Chunk 7 word count: 356
|
| 109 |
+
Chunk 8 word count: 248
|
| 110 |
+
Chunk 9 word count: 136
|
| 111 |
+
Chunk 10 word count: 419
|
| 112 |
+
Chunk 11 word count: 231
|
| 113 |
+
Chunk 12 word count: 94
|
| 114 |
+
Chunk 13 word count: 38
|
| 115 |
+
Chunk 14 word count: 472
|
| 116 |
+
Chunk 15 word count: 53
|
| 117 |
+
Chunk 16 word count: 134
|
| 118 |
+
|
| 119 |
+
Batch 2:
|
| 120 |
+
Chunk 1 word count: 56
|
| 121 |
+
Chunk 2 word count: 623
|
| 122 |
+
Chunk 3 word count: 177
|
| 123 |
+
Chunk 4 word count: 510
|
| 124 |
+
Chunk 5 word count: 107
|
| 125 |
+
Chunk 6 word count: 155
|
| 126 |
+
Chunk 7 word count: 461
|
| 127 |
+
Chunk 8 word count: 104
|
| 128 |
+
Chunk 9 word count: 97
|
| 129 |
+
Chunk 10 word count: 119
|
| 130 |
+
Chunk 11 word count: 356
|
| 131 |
+
Chunk 12 word count: 67
|
| 132 |
+
Chunk 13 word count: 145
|
| 133 |
+
Chunk 14 word count: 575
|
| 134 |
+
Chunk 15 word count: 246
|
| 135 |
+
Chunk 16 word count: 824
|
| 136 |
+
|
| 137 |
+
Batch 3:
|
| 138 |
+
Chunk 1 word count: 1063
|
| 139 |
+
Chunk 2 word count: 1574
|
| 140 |
+
Chunk 3 word count: 708
|
| 141 |
+
Chunk 4 word count: 232
|
| 142 |
+
Chunk 5 word count: 276
|
| 143 |
+
Chunk 6 word count: 237
|
| 144 |
+
Chunk 7 word count: 1995
|
| 145 |
+
Chunk 8 word count: 261
|
| 146 |
+
Chunk 9 word count: 154
|
| 147 |
+
Chunk 10 word count: 370
|
| 148 |
+
Chunk 11 word count: 47
|
| 149 |
+
Chunk 12 word count: 18
|
| 150 |
+
Chunk 13 word count: 91
|
| 151 |
+
Chunk 14 word count: 52
|
| 152 |
+
Chunk 15 word count: 10
|
| 153 |
+
Chunk 16 word count: 11
|
| 154 |
+
|
| 155 |
+
Batch 4:
|
| 156 |
+
Chunk 1 word count: 145
|
| 157 |
+
Chunk 2 word count: 44
|
| 158 |
+
Chunk 3 word count: 78
|
| 159 |
+
Chunk 4 word count: 202
|
| 160 |
+
Chunk 5 word count: 32
|
| 161 |
+
Chunk 6 word count: 74
|
| 162 |
+
Chunk 7 word count: 327
|
| 163 |
+
Chunk 8 word count: 192
|
| 164 |
+
Chunk 9 word count: 316
|
| 165 |
+
Chunk 10 word count: 541
|
| 166 |
+
Chunk 11 word count: 135
|
| 167 |
+
Chunk 12 word count: 228
|
| 168 |
+
Chunk 13 word count: 890
|
| 169 |
+
Chunk 14 word count: 457
|
| 170 |
+
Chunk 15 word count: 210
|
| 171 |
+
Chunk 16 word count: 122
|
| 172 |
+
|
| 173 |
+
Batch 5:
|
| 174 |
+
Chunk 1 word count: 102
|
| 175 |
+
Chunk 2 word count: 635
|
| 176 |
+
|
| 177 |
+
Summary:
|
| 178 |
+
Chunks split by level 1 headings (#): 63
|
| 179 |
+
Chunks split by grouped level 2 headings (##): 3
|
| 180 |
+
Chunks split by paragraphs: 0
|
| 181 |
+
Batch 1 embeddings received, total embeddings so far: 16
|
| 182 |
+
Batch 2 embeddings received, total embeddings so far: 32
|
| 183 |
+
Batch 3 embeddings received, total embeddings so far: 48
|
| 184 |
+
Batch 4 embeddings received, total embeddings so far: 64
|
| 185 |
+
Batch 5 embeddings received, total embeddings so far: 66
|
| 186 |
+
Upserted points 0 to 19
|
| 187 |
+
Upserted points 20 to 39
|
| 188 |
+
Upserted points 40 to 59
|
| 189 |
+
Upserted points 60 to 65
|
| 190 |
+
Total upserted points: 66
|
| 191 |
+
processing complete
|
| 192 |
+
INFO: 10.16.32.117:27543 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 193 |
+
delete done
|
| 194 |
+
https://hackrx.blob.core.windows.net/assets/Family%20Medicare%20Policy%20(UIN-%20UIIHLIP22070V042122)%201.pdf?sv=2023-01-03&st=2025-07-22T10%3A17%3A39Z&se=2025-08-23T10%3A17%3A00Z&sr=b&sp=r&sig=dA7BEMIZg3WcePcckBOb4QjfxK%2B4rIfxBs2%2F%2BNwoPjQ%3D
|
| 195 |
+
['Is Non-infective Arthritis covered?', 'I renewed my policy yesterday, and I have been a customer for the last 6 years. Can I raise a claim for Hydrocele?', 'Is abortion covered?']
|
| 196 |
+
|
| 197 |
+
Batch 1:
|
| 198 |
+
Chunk 1 word count: 315
|
| 199 |
+
Chunk 2 word count: 90
|
| 200 |
+
Chunk 3 word count: 1997
|
| 201 |
+
Chunk 4 word count: 1939
|
| 202 |
+
Chunk 5 word count: 667
|
| 203 |
+
Chunk 6 word count: 682
|
| 204 |
+
Chunk 7 word count: 1612
|
| 205 |
+
Chunk 8 word count: 1771
|
| 206 |
+
Chunk 9 word count: 1956
|
| 207 |
+
Chunk 10 word count: 1956
|
| 208 |
+
Chunk 11 word count: 77
|
| 209 |
+
Chunk 12 word count: 1929
|
| 210 |
+
Chunk 13 word count: 206
|
| 211 |
+
Chunk 14 word count: 1886
|
| 212 |
+
Chunk 15 word count: 1018
|
| 213 |
+
|
| 214 |
+
Summary:
|
| 215 |
+
Chunks split by level 1 headings (#): 0
|
| 216 |
+
Chunks split by grouped level 2 headings (##): 12
|
| 217 |
+
Chunks split by paragraphs: 0
|
| 218 |
+
Batch 1 embeddings received, total embeddings so far: 15
|
| 219 |
+
Upserted points 0 to 14
|
| 220 |
+
Total upserted points: 15
|
| 221 |
+
processing complete
|
| 222 |
+
INFO: 10.16.26.149:56262 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 223 |
+
delete done
|
| 224 |
+
https://hackrx.blob.core.windows.net/assets/indian_constitution.pdf?sv=2023-01-03&st=2025-07-28T06%3A42%3A00Z&se=2026-11-29T06%3A42%3A00Z&sr=b&sp=r&sig=5Gs%2FOXqP3zY00lgciu4BZjDV5QjTDIx7fgnfdz6Pu24%3D
|
| 225 |
+
['What is the official name of India according to Article 1 of the Constitution?', 'Which Article guarantees equality before the law and equal protection of laws to all persons?', 'What is abolished by Article 17 of the Constitution?', 'What are the key ideals mentioned in the Preamble of the Constitution of India?', 'Under which Article can Parliament alter the boundaries, area, or name of an existing State?', 'According to Article 24, children below what age are prohibited from working in hazardous industries like factories or mines?', 'What is the significance of Article 21 in the Indian Constitution?', 'Article 15 prohibits discrimination on certain grounds. However, which groups can the State make special provisions for under this Article?', 'Which Article allows Parliament to regulate the right of citizenship and override previous articles on citizenship (Articles 5 to 10)?', 'What restrictions can the State impose on the right to freedom of speech under Article 19(2)?']
|
| 226 |
+
|
| 227 |
+
Batch 1:
|
| 228 |
+
Chunk 1 word count: 1635
|
| 229 |
+
Chunk 2 word count: 1807
|
| 230 |
+
Chunk 3 word count: 74
|
| 231 |
+
Chunk 4 word count: 449
|
| 232 |
+
Chunk 5 word count: 875
|
| 233 |
+
Chunk 6 word count: 207
|
| 234 |
+
Chunk 7 word count: 19
|
| 235 |
+
Chunk 8 word count: 3
|
| 236 |
+
Chunk 9 word count: 180
|
| 237 |
+
Chunk 10 word count: 15
|
| 238 |
+
Chunk 11 word count: 7
|
| 239 |
+
Chunk 12 word count: 819
|
| 240 |
+
Chunk 13 word count: 3
|
| 241 |
+
Chunk 14 word count: 32
|
| 242 |
+
Chunk 15 word count: 19
|
| 243 |
+
Chunk 16 word count: 3
|
| 244 |
+
|
| 245 |
+
Batch 2:
|
| 246 |
+
Chunk 1 word count: 576
|
| 247 |
+
Chunk 2 word count: 119
|
| 248 |
+
Chunk 3 word count: 1989
|
| 249 |
+
Chunk 4 word count: 1809
|
| 250 |
+
Chunk 5 word count: 1786
|
| 251 |
+
Chunk 6 word count: 1789
|
| 252 |
+
Chunk 7 word count: 1995
|
| 253 |
+
Chunk 8 word count: 1834
|
| 254 |
+
Chunk 9 word count: 1948
|
| 255 |
+
Chunk 10 word count: 1735
|
| 256 |
+
Chunk 11 word count: 1779
|
| 257 |
+
Chunk 12 word count: 203
|
| 258 |
+
Chunk 13 word count: 1981
|
| 259 |
+
Chunk 14 word count: 1996
|
| 260 |
+
Chunk 15 word count: 1895
|
| 261 |
+
Chunk 16 word count: 1641
|
| 262 |
+
|
| 263 |
+
Batch 3:
|
| 264 |
+
Chunk 1 word count: 1513
|
| 265 |
+
Chunk 2 word count: 1559
|
| 266 |
+
Chunk 3 word count: 448
|
| 267 |
+
Chunk 4 word count: 732
|
| 268 |
+
Chunk 5 word count: 837
|
| 269 |
+
Chunk 6 word count: 409
|
| 270 |
+
Chunk 7 word count: 609
|
| 271 |
+
Chunk 8 word count: 1677
|
| 272 |
+
Chunk 9 word count: 1928
|
| 273 |
+
Chunk 10 word count: 674
|
| 274 |
+
Chunk 11 word count: 474
|
| 275 |
+
Chunk 12 word count: 436
|
| 276 |
+
Chunk 13 word count: 91
|
| 277 |
+
Chunk 14 word count: 345
|
| 278 |
+
Chunk 15 word count: 430
|
| 279 |
+
Chunk 16 word count: 1774
|
| 280 |
+
|
| 281 |
+
Batch 4:
|
| 282 |
+
Chunk 1 word count: 1076
|
| 283 |
+
Chunk 2 word count: 1596
|
| 284 |
+
Chunk 3 word count: 478
|
| 285 |
+
Chunk 4 word count: 704
|
| 286 |
+
Chunk 5 word count: 1711
|
| 287 |
+
Chunk 6 word count: 679
|
| 288 |
+
Chunk 7 word count: 436
|
| 289 |
+
Chunk 8 word count: 1462
|
| 290 |
+
Chunk 9 word count: 535
|
| 291 |
+
Chunk 10 word count: 296
|
| 292 |
+
Chunk 11 word count: 29
|
| 293 |
+
Chunk 12 word count: 348
|
| 294 |
+
Chunk 13 word count: 1236
|
| 295 |
+
Chunk 14 word count: 420
|
| 296 |
+
Chunk 15 word count: 843
|
| 297 |
+
Chunk 16 word count: 335
|
| 298 |
+
|
| 299 |
+
Batch 5:
|
| 300 |
+
Chunk 1 word count: 378
|
| 301 |
+
Chunk 2 word count: 448
|
| 302 |
+
Chunk 3 word count: 1090
|
| 303 |
+
Chunk 4 word count: 351
|
| 304 |
+
Chunk 5 word count: 411
|
| 305 |
+
Chunk 6 word count: 229
|
| 306 |
+
Chunk 7 word count: 336
|
| 307 |
+
Chunk 8 word count: 324
|
| 308 |
+
Chunk 9 word count: 397
|
| 309 |
+
Chunk 10 word count: 356
|
| 310 |
+
Chunk 11 word count: 314
|
| 311 |
+
Chunk 12 word count: 413
|
| 312 |
+
Chunk 13 word count: 286
|
| 313 |
+
Chunk 14 word count: 294
|
| 314 |
+
Chunk 15 word count: 238
|
| 315 |
+
Chunk 16 word count: 322
|
| 316 |
+
|
| 317 |
+
Batch 6:
|
| 318 |
+
Chunk 1 word count: 364
|
| 319 |
+
Chunk 2 word count: 345
|
| 320 |
+
Chunk 3 word count: 371
|
| 321 |
+
Chunk 4 word count: 339
|
| 322 |
+
Chunk 5 word count: 362
|
| 323 |
+
Chunk 6 word count: 356
|
| 324 |
+
Chunk 7 word count: 253
|
| 325 |
+
Chunk 8 word count: 1943
|
| 326 |
+
Chunk 9 word count: 1000
|
| 327 |
+
Chunk 10 word count: 752
|
| 328 |
+
Chunk 11 word count: 445
|
| 329 |
+
Chunk 12 word count: 449
|
| 330 |
+
Chunk 13 word count: 401
|
| 331 |
+
Chunk 14 word count: 1294
|
| 332 |
+
Chunk 15 word count: 673
|
| 333 |
+
Chunk 16 word count: 392
|
| 334 |
+
|
| 335 |
+
Batch 7:
|
| 336 |
+
Chunk 1 word count: 56
|
| 337 |
+
Chunk 2 word count: 805
|
| 338 |
+
Chunk 3 word count: 440
|
| 339 |
+
Chunk 4 word count: 515
|
| 340 |
+
Chunk 5 word count: 338
|
| 341 |
+
Chunk 6 word count: 89
|
| 342 |
+
Chunk 7 word count: 5
|
| 343 |
+
Chunk 8 word count: 98
|
| 344 |
+
Chunk 9 word count: 350
|
| 345 |
+
Chunk 10 word count: 836
|
| 346 |
+
Chunk 11 word count: 645
|
| 347 |
+
Chunk 12 word count: 10
|
| 348 |
+
Chunk 13 word count: 463
|
| 349 |
+
Chunk 14 word count: 443
|
| 350 |
+
Chunk 15 word count: 781
|
| 351 |
+
Chunk 16 word count: 113
|
| 352 |
+
|
| 353 |
+
Batch 8:
|
| 354 |
+
Chunk 1 word count: 319
|
| 355 |
+
Chunk 2 word count: 439
|
| 356 |
+
Chunk 3 word count: 434
|
| 357 |
+
Chunk 4 word count: 1401
|
| 358 |
+
Chunk 5 word count: 323
|
| 359 |
+
Chunk 6 word count: 340
|
| 360 |
+
Chunk 7 word count: 320
|
| 361 |
+
Chunk 8 word count: 1129
|
| 362 |
+
Chunk 9 word count: 1853
|
| 363 |
+
Chunk 10 word count: 1986
|
| 364 |
+
Chunk 11 word count: 1996
|
| 365 |
+
Chunk 12 word count: 493
|
| 366 |
+
Chunk 13 word count: 348
|
| 367 |
+
Chunk 14 word count: 403
|
| 368 |
+
Chunk 15 word count: 781
|
| 369 |
+
Chunk 16 word count: 523
|
| 370 |
+
|
| 371 |
+
Batch 9:
|
| 372 |
+
Chunk 1 word count: 470
|
| 373 |
+
Chunk 2 word count: 374
|
| 374 |
+
Chunk 3 word count: 386
|
| 375 |
+
Chunk 4 word count: 412
|
| 376 |
+
Chunk 5 word count: 402
|
| 377 |
+
Chunk 6 word count: 419
|
| 378 |
+
Chunk 7 word count: 423
|
| 379 |
+
Chunk 8 word count: 1558
|
| 380 |
+
Chunk 9 word count: 402
|
| 381 |
+
Chunk 10 word count: 1727
|
| 382 |
+
Chunk 11 word count: 644
|
| 383 |
+
Chunk 12 word count: 298
|
| 384 |
+
Chunk 13 word count: 245
|
| 385 |
+
Chunk 14 word count: 3
|
| 386 |
+
Chunk 15 word count: 259
|
| 387 |
+
Chunk 16 word count: 1660
|
| 388 |
+
|
| 389 |
+
Batch 10:
|
| 390 |
+
Chunk 1 word count: 1802
|
| 391 |
+
Chunk 2 word count: 1802
|
| 392 |
+
Chunk 3 word count: 1879
|
| 393 |
+
Chunk 4 word count: 1601
|
| 394 |
+
Chunk 5 word count: 415
|
| 395 |
+
Chunk 6 word count: 3
|
| 396 |
+
Chunk 7 word count: 693
|
| 397 |
+
Chunk 8 word count: 318
|
| 398 |
+
Chunk 9 word count: 1491
|
| 399 |
+
Chunk 10 word count: 359
|
| 400 |
+
Chunk 11 word count: 364
|
| 401 |
+
Chunk 12 word count: 316
|
| 402 |
+
Chunk 13 word count: 158
|
| 403 |
+
Chunk 14 word count: 16
|
| 404 |
+
Chunk 15 word count: 320
|
| 405 |
+
Chunk 16 word count: 772
|
| 406 |
+
|
| 407 |
+
Batch 11:
|
| 408 |
+
Chunk 1 word count: 412
|
| 409 |
+
Chunk 2 word count: 1081
|
| 410 |
+
Chunk 3 word count: 295
|
| 411 |
+
Chunk 4 word count: 504
|
| 412 |
+
Chunk 5 word count: 7
|
| 413 |
+
Chunk 6 word count: 310
|
| 414 |
+
Chunk 7 word count: 715
|
| 415 |
+
Chunk 8 word count: 1987
|
| 416 |
+
Chunk 9 word count: 1836
|
| 417 |
+
Chunk 10 word count: 1945
|
| 418 |
+
Chunk 11 word count: 1869
|
| 419 |
+
Chunk 12 word count: 1707
|
| 420 |
+
Chunk 13 word count: 1809
|
| 421 |
+
Chunk 14 word count: 1795
|
| 422 |
+
Chunk 15 word count: 1958
|
| 423 |
+
Chunk 16 word count: 1722
|
| 424 |
+
|
| 425 |
+
Batch 12:
|
| 426 |
+
Chunk 1 word count: 1984
|
| 427 |
+
Chunk 2 word count: 1751
|
| 428 |
+
Chunk 3 word count: 304
|
| 429 |
+
Chunk 4 word count: 1926
|
| 430 |
+
Chunk 5 word count: 2000
|
| 431 |
+
Chunk 6 word count: 1642
|
| 432 |
+
Chunk 7 word count: 1991
|
| 433 |
+
Chunk 8 word count: 1843
|
| 434 |
+
Chunk 9 word count: 1139
|
| 435 |
+
Chunk 10 word count: 1966
|
| 436 |
+
Chunk 11 word count: 1846
|
| 437 |
+
Chunk 12 word count: 774
|
| 438 |
+
|
| 439 |
+
Summary:
|
| 440 |
+
Chunks split by level 1 headings (#): 127
|
| 441 |
+
Chunks split by grouped level 2 headings (##): 57
|
| 442 |
+
Chunks split by paragraphs: 0
|
| 443 |
+
Batch 1 embeddings received, total embeddings so far: 16
|
| 444 |
+
Batch 2 embeddings received, total embeddings so far: 32
|
| 445 |
+
Batch 3 embeddings received, total embeddings so far: 48
|
| 446 |
+
Batch 4 embeddings received, total embeddings so far: 64
|
| 447 |
+
Batch 5 embeddings received, total embeddings so far: 80
|
| 448 |
+
Batch 6 embeddings received, total embeddings so far: 96
|
| 449 |
+
Batch 7 embeddings received, total embeddings so far: 112
|
| 450 |
+
Batch 8 embeddings received, total embeddings so far: 128
|
| 451 |
+
Batch 9 embeddings received, total embeddings so far: 144
|
| 452 |
+
Batch 10 embeddings received, total embeddings so far: 160
|
| 453 |
+
Batch 11 embeddings received, total embeddings so far: 176
|
| 454 |
+
Batch 12 embeddings received, total embeddings so far: 188
|
| 455 |
+
Upserted points 0 to 19
|
| 456 |
+
Upserted points 20 to 39
|
| 457 |
+
Upserted points 40 to 59
|
| 458 |
+
Upserted points 60 to 79
|
| 459 |
+
Upserted points 80 to 99
|
| 460 |
+
Upserted points 100 to 119
|
| 461 |
+
Upserted points 120 to 139
|
| 462 |
+
Upserted points 140 to 159
|
| 463 |
+
Upserted points 160 to 179
|
| 464 |
+
Upserted points 180 to 187
|
| 465 |
+
Total upserted points: 188
|
| 466 |
+
processing complete
|
| 467 |
+
INFO: 10.16.21.252:54043 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 468 |
+
https://hackrx.blob.core.windows.net/assets/indian_constitution.pdf?sv=2023-01-03&st=2025-07-28T06%3A42%3A00Z&se=2026-11-29T06%3A42%3A00Z&sr=b&sp=r&sig=5Gs%2FOXqP3zY00lgciu4BZjDV5QjTDIx7fgnfdz6Pu24%3D
|
| 469 |
+
['If my car is stolen, what case will it be in law?', 'If I am arrested without a warrant, is that legal?', 'If someone denies me a job because of my caste, is that allowed?', 'If the government takes my land for a project, can I stop it?', 'If my child is forced to work in a factory, is that legal?', 'If I am stopped from speaking at a protest, is that against my rights?', "If a religious place stops me from entering because I'm a woman, is that constitutional?", 'If I change my religion, can the government stop me?', 'If the police torture someone in custody, what right is being violated?', "If I'm denied admission to a public university because I'm from a backward community, can I do something?"]
|
| 470 |
+
delete done
|
| 471 |
+
|
| 472 |
+
Batch 1:
|
| 473 |
+
Chunk 1 word count: 1635
|
| 474 |
+
Chunk 2 word count: 1807
|
| 475 |
+
Chunk 3 word count: 74
|
| 476 |
+
Chunk 4 word count: 449
|
| 477 |
+
Chunk 5 word count: 875
|
| 478 |
+
Chunk 6 word count: 207
|
| 479 |
+
Chunk 7 word count: 19
|
| 480 |
+
Chunk 8 word count: 3
|
| 481 |
+
Chunk 9 word count: 180
|
| 482 |
+
Chunk 10 word count: 15
|
| 483 |
+
Chunk 11 word count: 7
|
| 484 |
+
Chunk 12 word count: 819
|
| 485 |
+
Chunk 13 word count: 3
|
| 486 |
+
Chunk 14 word count: 32
|
| 487 |
+
Chunk 15 word count: 19
|
| 488 |
+
Chunk 16 word count: 3
|
| 489 |
+
|
| 490 |
+
Batch 2:
|
| 491 |
+
Chunk 1 word count: 576
|
| 492 |
+
Chunk 2 word count: 119
|
| 493 |
+
Chunk 3 word count: 1989
|
| 494 |
+
Chunk 4 word count: 1809
|
| 495 |
+
Chunk 5 word count: 1786
|
| 496 |
+
Chunk 6 word count: 1789
|
| 497 |
+
Chunk 7 word count: 1995
|
| 498 |
+
Chunk 8 word count: 1834
|
| 499 |
+
Chunk 9 word count: 1948
|
| 500 |
+
Chunk 10 word count: 1735
|
| 501 |
+
Chunk 11 word count: 1779
|
| 502 |
+
Chunk 12 word count: 203
|
| 503 |
+
Chunk 13 word count: 1981
|
| 504 |
+
Chunk 14 word count: 1996
|
| 505 |
+
Chunk 15 word count: 1895
|
| 506 |
+
Chunk 16 word count: 1641
|
| 507 |
+
|
| 508 |
+
Batch 3:
|
| 509 |
+
Chunk 1 word count: 1513
|
| 510 |
+
Chunk 2 word count: 1559
|
| 511 |
+
Chunk 3 word count: 448
|
| 512 |
+
Chunk 4 word count: 732
|
| 513 |
+
Chunk 5 word count: 837
|
| 514 |
+
Chunk 6 word count: 409
|
| 515 |
+
Chunk 7 word count: 609
|
| 516 |
+
Chunk 8 word count: 1677
|
| 517 |
+
Chunk 9 word count: 1928
|
| 518 |
+
Chunk 10 word count: 674
|
| 519 |
+
Chunk 11 word count: 474
|
| 520 |
+
Chunk 12 word count: 436
|
| 521 |
+
Chunk 13 word count: 91
|
| 522 |
+
Chunk 14 word count: 345
|
| 523 |
+
Chunk 15 word count: 430
|
| 524 |
+
Chunk 16 word count: 1774
|
| 525 |
+
|
| 526 |
+
Batch 4:
|
| 527 |
+
Chunk 1 word count: 1076
|
| 528 |
+
Chunk 2 word count: 1596
|
| 529 |
+
Chunk 3 word count: 478
|
| 530 |
+
Chunk 4 word count: 704
|
| 531 |
+
Chunk 5 word count: 1711
|
| 532 |
+
Chunk 6 word count: 679
|
| 533 |
+
Chunk 7 word count: 436
|
| 534 |
+
Chunk 8 word count: 1462
|
| 535 |
+
Chunk 9 word count: 535
|
| 536 |
+
Chunk 10 word count: 296
|
| 537 |
+
Chunk 11 word count: 29
|
| 538 |
+
Chunk 12 word count: 348
|
| 539 |
+
Chunk 13 word count: 1236
|
| 540 |
+
Chunk 14 word count: 420
|
| 541 |
+
Chunk 15 word count: 843
|
| 542 |
+
Chunk 16 word count: 335
|
| 543 |
+
|
| 544 |
+
Batch 5:
|
| 545 |
+
Chunk 1 word count: 378
|
| 546 |
+
Chunk 2 word count: 448
|
| 547 |
+
Chunk 3 word count: 1090
|
| 548 |
+
Chunk 4 word count: 351
|
| 549 |
+
Chunk 5 word count: 411
|
| 550 |
+
Chunk 6 word count: 229
|
| 551 |
+
Chunk 7 word count: 336
|
| 552 |
+
Chunk 8 word count: 324
|
| 553 |
+
Chunk 9 word count: 397
|
| 554 |
+
Chunk 10 word count: 356
|
| 555 |
+
Chunk 11 word count: 314
|
| 556 |
+
Chunk 12 word count: 413
|
| 557 |
+
Chunk 13 word count: 286
|
| 558 |
+
Chunk 14 word count: 294
|
| 559 |
+
Chunk 15 word count: 238
|
| 560 |
+
Chunk 16 word count: 322
|
| 561 |
+
|
| 562 |
+
Batch 6:
|
| 563 |
+
Chunk 1 word count: 364
|
| 564 |
+
Chunk 2 word count: 345
|
| 565 |
+
Chunk 3 word count: 371
|
| 566 |
+
Chunk 4 word count: 339
|
| 567 |
+
Chunk 5 word count: 362
|
| 568 |
+
Chunk 6 word count: 356
|
| 569 |
+
Chunk 7 word count: 253
|
| 570 |
+
Chunk 8 word count: 1943
|
| 571 |
+
Chunk 9 word count: 1000
|
| 572 |
+
Chunk 10 word count: 752
|
| 573 |
+
Chunk 11 word count: 445
|
| 574 |
+
Chunk 12 word count: 449
|
| 575 |
+
Chunk 13 word count: 401
|
| 576 |
+
Chunk 14 word count: 1294
|
| 577 |
+
Chunk 15 word count: 673
|
| 578 |
+
Chunk 16 word count: 392
|
| 579 |
+
|
| 580 |
+
Batch 7:
|
| 581 |
+
Chunk 1 word count: 56
|
| 582 |
+
Chunk 2 word count: 805
|
| 583 |
+
Chunk 3 word count: 440
|
| 584 |
+
Chunk 4 word count: 515
|
| 585 |
+
Chunk 5 word count: 338
|
| 586 |
+
Chunk 6 word count: 89
|
| 587 |
+
Chunk 7 word count: 5
|
| 588 |
+
Chunk 8 word count: 98
|
| 589 |
+
Chunk 9 word count: 350
|
| 590 |
+
Chunk 10 word count: 836
|
| 591 |
+
Chunk 11 word count: 645
|
| 592 |
+
Chunk 12 word count: 10
|
| 593 |
+
Chunk 13 word count: 463
|
| 594 |
+
Chunk 14 word count: 443
|
| 595 |
+
Chunk 15 word count: 781
|
| 596 |
+
Chunk 16 word count: 113
|
| 597 |
+
|
| 598 |
+
Batch 8:
|
| 599 |
+
Chunk 1 word count: 319
|
| 600 |
+
Chunk 2 word count: 439
|
| 601 |
+
Chunk 3 word count: 434
|
| 602 |
+
Chunk 4 word count: 1401
|
| 603 |
+
Chunk 5 word count: 323
|
| 604 |
+
Chunk 6 word count: 340
|
| 605 |
+
Chunk 7 word count: 320
|
| 606 |
+
Chunk 8 word count: 1129
|
| 607 |
+
Chunk 9 word count: 1853
|
| 608 |
+
Chunk 10 word count: 1986
|
| 609 |
+
Chunk 11 word count: 1996
|
| 610 |
+
Chunk 12 word count: 493
|
| 611 |
+
Chunk 13 word count: 348
|
| 612 |
+
Chunk 14 word count: 403
|
| 613 |
+
Chunk 15 word count: 781
|
| 614 |
+
Chunk 16 word count: 523
|
| 615 |
+
|
| 616 |
+
Batch 9:
|
| 617 |
+
Chunk 1 word count: 470
|
| 618 |
+
Chunk 2 word count: 374
|
| 619 |
+
Chunk 3 word count: 386
|
| 620 |
+
Chunk 4 word count: 412
|
| 621 |
+
Chunk 5 word count: 402
|
| 622 |
+
Chunk 6 word count: 419
|
| 623 |
+
Chunk 7 word count: 423
|
| 624 |
+
Chunk 8 word count: 1558
|
| 625 |
+
Chunk 9 word count: 402
|
| 626 |
+
Chunk 10 word count: 1727
|
| 627 |
+
Chunk 11 word count: 644
|
| 628 |
+
Chunk 12 word count: 298
|
| 629 |
+
Chunk 13 word count: 245
|
| 630 |
+
Chunk 14 word count: 3
|
| 631 |
+
Chunk 15 word count: 259
|
| 632 |
+
Chunk 16 word count: 1660
|
| 633 |
+
|
| 634 |
+
Batch 10:
|
| 635 |
+
Chunk 1 word count: 1802
|
| 636 |
+
Chunk 2 word count: 1802
|
| 637 |
+
Chunk 3 word count: 1879
|
| 638 |
+
Chunk 4 word count: 1601
|
| 639 |
+
Chunk 5 word count: 415
|
| 640 |
+
Chunk 6 word count: 3
|
| 641 |
+
Chunk 7 word count: 693
|
| 642 |
+
Chunk 8 word count: 318
|
| 643 |
+
Chunk 9 word count: 1491
|
| 644 |
+
Chunk 10 word count: 359
|
| 645 |
+
Chunk 11 word count: 364
|
| 646 |
+
Chunk 12 word count: 316
|
| 647 |
+
Chunk 13 word count: 158
|
| 648 |
+
Chunk 14 word count: 16
|
| 649 |
+
Chunk 15 word count: 320
|
| 650 |
+
Chunk 16 word count: 772
|
| 651 |
+
|
| 652 |
+
Batch 11:
|
| 653 |
+
Chunk 1 word count: 412
|
| 654 |
+
Chunk 2 word count: 1081
|
| 655 |
+
Chunk 3 word count: 295
|
| 656 |
+
Chunk 4 word count: 504
|
| 657 |
+
Chunk 5 word count: 7
|
| 658 |
+
Chunk 6 word count: 310
|
| 659 |
+
Chunk 7 word count: 715
|
| 660 |
+
Chunk 8 word count: 1987
|
| 661 |
+
Chunk 9 word count: 1836
|
| 662 |
+
Chunk 10 word count: 1945
|
| 663 |
+
Chunk 11 word count: 1869
|
| 664 |
+
Chunk 12 word count: 1707
|
| 665 |
+
Chunk 13 word count: 1809
|
| 666 |
+
Chunk 14 word count: 1795
|
| 667 |
+
Chunk 15 word count: 1958
|
| 668 |
+
Chunk 16 word count: 1722
|
| 669 |
+
|
| 670 |
+
Batch 12:
|
| 671 |
+
Chunk 1 word count: 1984
|
| 672 |
+
Chunk 2 word count: 1751
|
| 673 |
+
Chunk 3 word count: 304
|
| 674 |
+
Chunk 4 word count: 1926
|
| 675 |
+
Chunk 5 word count: 2000
|
| 676 |
+
Chunk 6 word count: 1642
|
| 677 |
+
Chunk 7 word count: 1991
|
| 678 |
+
Chunk 8 word count: 1843
|
| 679 |
+
Chunk 9 word count: 1139
|
| 680 |
+
Chunk 10 word count: 1966
|
| 681 |
+
Chunk 11 word count: 1846
|
| 682 |
+
Chunk 12 word count: 774
|
| 683 |
+
|
| 684 |
+
Summary:
|
| 685 |
+
Chunks split by level 1 headings (#): 127
|
| 686 |
+
Chunks split by grouped level 2 headings (##): 57
|
| 687 |
+
Chunks split by paragraphs: 0
|
| 688 |
+
Batch 1 embeddings received, total embeddings so far: 16
|
| 689 |
+
Batch 2 embeddings received, total embeddings so far: 32
|
| 690 |
+
Batch 3 embeddings received, total embeddings so far: 48
|
| 691 |
+
Batch 4 embeddings received, total embeddings so far: 64
|
| 692 |
+
Batch 5 embeddings received, total embeddings so far: 80
|
| 693 |
+
Batch 6 embeddings received, total embeddings so far: 96
|
| 694 |
+
Batch 7 embeddings received, total embeddings so far: 112
|
| 695 |
+
Batch 8 embeddings received, total embeddings so far: 128
|
| 696 |
+
Batch 9 embeddings received, total embeddings so far: 144
|
| 697 |
+
Batch 10 embeddings received, total embeddings so far: 160
|
| 698 |
+
Batch 11 embeddings received, total embeddings so far: 176
|
| 699 |
+
Batch 12 embeddings received, total embeddings so far: 188
|
| 700 |
+
Upserted points 0 to 19
|
| 701 |
+
Upserted points 20 to 39
|
| 702 |
+
Upserted points 40 to 59
|
| 703 |
+
Upserted points 60 to 79
|
| 704 |
+
Upserted points 80 to 99
|
| 705 |
+
Upserted points 100 to 119
|
| 706 |
+
Upserted points 120 to 139
|
| 707 |
+
Upserted points 140 to 159
|
| 708 |
+
Upserted points 160 to 179
|
| 709 |
+
Upserted points 180 to 187
|
| 710 |
+
Total upserted points: 188
|
| 711 |
+
processing complete
|
| 712 |
+
INFO: 10.16.21.252:20306 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 713 |
+
https://hackrx.blob.core.windows.net/assets/principia_newton.pdf?sv=2023-01-03&st=2025-07-28T07%3A20%3A32Z&se=2026-07-29T07%3A20%3A00Z&sr=b&sp=r&sig=V5I1QYyigoxeUMbnUKsdEaST99F5%2FDfo7wpKg9XXF5w%3D
|
| 714 |
+
["How does Newton define 'quantity of motion' and how is it distinct from 'force'?", 'According to Newton, what are the three laws of motion and how do they apply in celestial mechanics?', "How does Newton derive Kepler's Second Law (equal areas in equal times) from his laws of motion and gravitation?", 'How does Newton demonstrate that gravity is inversely proportional to the square of the distance between two masses?', "What is Newton's argument for why gravitational force must act on all masses universally?", 'How does Newton explain the perturbation of planetary orbits due to other planets?', "What mathematical tools did Newton use in Principia that were precursors to calculus, and why didn't he use standard calculus notation?", 'How does Newton use the concept of centripetal force to explain orbital motion?', 'How does Newton handle motion in resisting media, such as air or fluids?', "In what way does Newton's notion of absolute space and time differ from relative motion, and how does it support his laws?", 'Who was the grandfather of Isaac Newton?', 'Do we know any other descent of Isaac Newton apart from his grandfather?']
|
| 715 |
+
|
| 716 |
+
Batch 1:
|
| 717 |
+
Chunk 1 word count: 122
|
| 718 |
+
Chunk 2 word count: 1424
|
| 719 |
+
Chunk 3 word count: 1994
|
| 720 |
+
Chunk 4 word count: 202
|
| 721 |
+
Chunk 5 word count: 1945
|
| 722 |
+
Chunk 6 word count: 1983
|
| 723 |
+
Chunk 7 word count: 202
|
| 724 |
+
Chunk 8 word count: 1877
|
| 725 |
+
Chunk 9 word count: 202
|
| 726 |
+
Chunk 10 word count: 1967
|
| 727 |
+
Chunk 11 word count: 202
|
| 728 |
+
Chunk 12 word count: 1862
|
| 729 |
+
Chunk 13 word count: 202
|
| 730 |
+
Chunk 14 word count: 1860
|
| 731 |
+
Chunk 15 word count: 202
|
| 732 |
+
Chunk 16 word count: 1887
|
| 733 |
+
|
| 734 |
+
Batch 2:
|
| 735 |
+
Chunk 1 word count: 202
|
| 736 |
+
Chunk 2 word count: 1988
|
| 737 |
+
Chunk 3 word count: 202
|
| 738 |
+
Chunk 4 word count: 1998
|
| 739 |
+
Chunk 5 word count: 202
|
| 740 |
+
Chunk 6 word count: 1958
|
| 741 |
+
Chunk 7 word count: 1915
|
| 742 |
+
Chunk 8 word count: 2
|
| 743 |
+
Chunk 9 word count: 1986
|
| 744 |
+
Chunk 10 word count: 318
|
| 745 |
+
Chunk 11 word count: 1933
|
| 746 |
+
Chunk 12 word count: 1606
|
| 747 |
+
Chunk 13 word count: 1992
|
| 748 |
+
Chunk 14 word count: 203
|
| 749 |
+
Chunk 15 word count: 1901
|
| 750 |
+
Chunk 16 word count: 203
|
| 751 |
+
|
| 752 |
+
Batch 3:
|
| 753 |
+
Chunk 1 word count: 1899
|
| 754 |
+
Chunk 2 word count: 1202
|
| 755 |
+
Chunk 3 word count: 1948
|
| 756 |
+
Chunk 4 word count: 203
|
| 757 |
+
Chunk 5 word count: 1919
|
| 758 |
+
Chunk 6 word count: 203
|
| 759 |
+
Chunk 7 word count: 1948
|
| 760 |
+
Chunk 8 word count: 1682
|
| 761 |
+
Chunk 9 word count: 1784
|
| 762 |
+
Chunk 10 word count: 1784
|
| 763 |
+
Chunk 11 word count: 1984
|
| 764 |
+
Chunk 12 word count: 1974
|
| 765 |
+
Chunk 13 word count: 1046
|
| 766 |
+
Chunk 14 word count: 1850
|
| 767 |
+
Chunk 15 word count: 1966
|
| 768 |
+
Chunk 16 word count: 1225
|
| 769 |
+
|
| 770 |
+
Batch 4:
|
| 771 |
+
Chunk 1 word count: 1488
|
| 772 |
+
Chunk 2 word count: 1488
|
| 773 |
+
Chunk 3 word count: 1781
|
| 774 |
+
Chunk 4 word count: 1951
|
| 775 |
+
Chunk 5 word count: 1804
|
| 776 |
+
Chunk 6 word count: 203
|
| 777 |
+
Chunk 7 word count: 1998
|
| 778 |
+
Chunk 8 word count: 1885
|
| 779 |
+
Chunk 9 word count: 1623
|
| 780 |
+
Chunk 10 word count: 1933
|
| 781 |
+
Chunk 11 word count: 838
|
| 782 |
+
Chunk 12 word count: 1498
|
| 783 |
+
Chunk 13 word count: 796
|
| 784 |
+
Chunk 14 word count: 1159
|
| 785 |
+
Chunk 15 word count: 389
|
| 786 |
+
Chunk 16 word count: 19
|
| 787 |
+
|
| 788 |
+
Batch 5:
|
| 789 |
+
Chunk 1 word count: 336
|
| 790 |
+
Chunk 2 word count: 24
|
| 791 |
+
Chunk 3 word count: 655
|
| 792 |
+
Chunk 4 word count: 127
|
| 793 |
+
Chunk 5 word count: 108
|
| 794 |
+
Chunk 6 word count: 902
|
| 795 |
+
Chunk 7 word count: 1110
|
| 796 |
+
Chunk 8 word count: 1119
|
| 797 |
+
Chunk 9 word count: 1382
|
| 798 |
+
Chunk 10 word count: 393
|
| 799 |
+
Chunk 11 word count: 241
|
| 800 |
+
Chunk 12 word count: 169
|
| 801 |
+
Chunk 13 word count: 918
|
| 802 |
+
Chunk 14 word count: 169
|
| 803 |
+
Chunk 15 word count: 283
|
| 804 |
+
Chunk 16 word count: 381
|
| 805 |
+
|
| 806 |
+
Batch 6:
|
| 807 |
+
Chunk 1 word count: 139
|
| 808 |
+
Chunk 2 word count: 267
|
| 809 |
+
Chunk 3 word count: 675
|
| 810 |
+
Chunk 4 word count: 1037
|
| 811 |
+
Chunk 5 word count: 1900
|
| 812 |
+
Chunk 6 word count: 1973
|
| 813 |
+
Chunk 7 word count: 1949
|
| 814 |
+
Chunk 8 word count: 1325
|
| 815 |
+
Chunk 9 word count: 167
|
| 816 |
+
Chunk 10 word count: 567
|
| 817 |
+
Chunk 11 word count: 580
|
| 818 |
+
Chunk 12 word count: 336
|
| 819 |
+
Chunk 13 word count: 1436
|
| 820 |
+
Chunk 14 word count: 86
|
| 821 |
+
Chunk 15 word count: 269
|
| 822 |
+
Chunk 16 word count: 384
|
| 823 |
+
|
| 824 |
+
Batch 7:
|
| 825 |
+
Chunk 1 word count: 655
|
| 826 |
+
Chunk 2 word count: 840
|
| 827 |
+
Chunk 3 word count: 154
|
| 828 |
+
Chunk 4 word count: 372
|
| 829 |
+
Chunk 5 word count: 432
|
| 830 |
+
Chunk 6 word count: 567
|
| 831 |
+
Chunk 7 word count: 1018
|
| 832 |
+
Chunk 8 word count: 664
|
| 833 |
+
Chunk 9 word count: 269
|
| 834 |
+
Chunk 10 word count: 126
|
| 835 |
+
Chunk 11 word count: 81
|
| 836 |
+
Chunk 12 word count: 16
|
| 837 |
+
Chunk 13 word count: 308
|
| 838 |
+
Chunk 14 word count: 1339
|
| 839 |
+
Chunk 15 word count: 648
|
| 840 |
+
Chunk 16 word count: 1052
|
| 841 |
+
|
| 842 |
+
Batch 8:
|
| 843 |
+
Chunk 1 word count: 111
|
| 844 |
+
Chunk 2 word count: 784
|
| 845 |
+
Chunk 3 word count: 438
|
| 846 |
+
Chunk 4 word count: 25
|
| 847 |
+
Chunk 5 word count: 1435
|
| 848 |
+
Chunk 6 word count: 1584
|
| 849 |
+
Chunk 7 word count: 1908
|
| 850 |
+
Chunk 8 word count: 1707
|
| 851 |
+
Chunk 9 word count: 1884
|
| 852 |
+
Chunk 10 word count: 1521
|
| 853 |
+
Chunk 11 word count: 803
|
| 854 |
+
Chunk 12 word count: 1569
|
| 855 |
+
Chunk 13 word count: 203
|
| 856 |
+
Chunk 14 word count: 1924
|
| 857 |
+
Chunk 15 word count: 203
|
| 858 |
+
Chunk 16 word count: 1949
|
| 859 |
+
|
| 860 |
+
Batch 9:
|
| 861 |
+
Chunk 1 word count: 1870
|
| 862 |
+
Chunk 2 word count: 1714
|
| 863 |
+
Chunk 3 word count: 1790
|
| 864 |
+
Chunk 4 word count: 203
|
| 865 |
+
Chunk 5 word count: 1804
|
| 866 |
+
Chunk 6 word count: 884
|
| 867 |
+
Chunk 7 word count: 40
|
| 868 |
+
Chunk 8 word count: 856
|
| 869 |
+
Chunk 9 word count: 771
|
| 870 |
+
Chunk 10 word count: 1719
|
| 871 |
+
Chunk 11 word count: 354
|
| 872 |
+
Chunk 12 word count: 11
|
| 873 |
+
Chunk 13 word count: 564
|
| 874 |
+
Chunk 14 word count: 980
|
| 875 |
+
Chunk 15 word count: 1489
|
| 876 |
+
Chunk 16 word count: 1892
|
| 877 |
+
|
| 878 |
+
Batch 10:
|
| 879 |
+
Chunk 1 word count: 1821
|
| 880 |
+
Chunk 2 word count: 1947
|
| 881 |
+
Chunk 3 word count: 1550
|
| 882 |
+
Chunk 4 word count: 3
|
| 883 |
+
Chunk 5 word count: 757
|
| 884 |
+
Chunk 6 word count: 1580
|
| 885 |
+
Chunk 7 word count: 1243
|
| 886 |
+
Chunk 8 word count: 213
|
| 887 |
+
Chunk 9 word count: 1939
|
| 888 |
+
Chunk 10 word count: 213
|
| 889 |
+
Chunk 11 word count: 1894
|
| 890 |
+
Chunk 12 word count: 1737
|
| 891 |
+
Chunk 13 word count: 1599
|
| 892 |
+
Chunk 14 word count: 1387
|
| 893 |
+
Chunk 15 word count: 959
|
| 894 |
+
Chunk 16 word count: 94
|
| 895 |
+
|
| 896 |
+
Batch 11:
|
| 897 |
+
Chunk 1 word count: 2000
|
| 898 |
+
Chunk 2 word count: 1
|
| 899 |
+
Chunk 3 word count: 213
|
| 900 |
+
Chunk 4 word count: 1869
|
| 901 |
+
Chunk 5 word count: 213
|
| 902 |
+
Chunk 6 word count: 1896
|
| 903 |
+
Chunk 7 word count: 613
|
| 904 |
+
Chunk 8 word count: 1463
|
| 905 |
+
Chunk 9 word count: 1541
|
| 906 |
+
Chunk 10 word count: 481
|
| 907 |
+
Chunk 11 word count: 1447
|
| 908 |
+
Chunk 12 word count: 791
|
| 909 |
+
Chunk 13 word count: 213
|
| 910 |
+
Chunk 14 word count: 1911
|
| 911 |
+
Chunk 15 word count: 361
|
| 912 |
+
Chunk 16 word count: 44
|
| 913 |
+
|
| 914 |
+
Batch 12:
|
| 915 |
+
Chunk 1 word count: 915
|
| 916 |
+
Chunk 2 word count: 1995
|
| 917 |
+
Chunk 3 word count: 414
|
| 918 |
+
Chunk 4 word count: 817
|
| 919 |
+
Chunk 5 word count: 446
|
| 920 |
+
Chunk 6 word count: 545
|
| 921 |
+
Chunk 7 word count: 348
|
| 922 |
+
Chunk 8 word count: 869
|
| 923 |
+
Chunk 9 word count: 89
|
| 924 |
+
Chunk 10 word count: 1920
|
| 925 |
+
Chunk 11 word count: 206
|
| 926 |
+
Chunk 12 word count: 1951
|
| 927 |
+
Chunk 13 word count: 1993
|
| 928 |
+
Chunk 14 word count: 1704
|
| 929 |
+
Chunk 15 word count: 1975
|
| 930 |
+
Chunk 16 word count: 1240
|
| 931 |
+
|
| 932 |
+
Batch 13:
|
| 933 |
+
Chunk 1 word count: 173
|
| 934 |
+
Chunk 2 word count: 1996
|
| 935 |
+
Chunk 3 word count: 1924
|
| 936 |
+
Chunk 4 word count: 1370
|
| 937 |
+
Chunk 5 word count: 1716
|
| 938 |
+
Chunk 6 word count: 1596
|
| 939 |
+
Chunk 7 word count: 1973
|
| 940 |
+
Chunk 8 word count: 1631
|
| 941 |
+
Chunk 9 word count: 900
|
| 942 |
+
Chunk 10 word count: 206
|
| 943 |
+
Chunk 11 word count: 1993
|
| 944 |
+
Chunk 12 word count: 206
|
| 945 |
+
Chunk 13 word count: 1972
|
| 946 |
+
Chunk 14 word count: 1522
|
| 947 |
+
Chunk 15 word count: 788
|
| 948 |
+
Chunk 16 word count: 75
|
| 949 |
+
|
| 950 |
+
Batch 14:
|
| 951 |
+
Chunk 1 word count: 1945
|
| 952 |
+
Chunk 2 word count: 1607
|
| 953 |
+
Chunk 3 word count: 206
|
| 954 |
+
Chunk 4 word count: 1810
|
| 955 |
+
Chunk 5 word count: 1309
|
| 956 |
+
Chunk 6 word count: 206
|
| 957 |
+
Chunk 7 word count: 1892
|
| 958 |
+
Chunk 8 word count: 1903
|
| 959 |
+
Chunk 9 word count: 1391
|
| 960 |
+
Chunk 10 word count: 1006
|
| 961 |
+
Chunk 11 word count: 206
|
| 962 |
+
Chunk 12 word count: 1951
|
| 963 |
+
Chunk 13 word count: 1935
|
| 964 |
+
Chunk 14 word count: 1933
|
| 965 |
+
Chunk 15 word count: 545
|
| 966 |
+
Chunk 16 word count: 206
|
| 967 |
+
|
| 968 |
+
Batch 15:
|
| 969 |
+
Chunk 1 word count: 1802
|
| 970 |
+
Chunk 2 word count: 206
|
| 971 |
+
Chunk 3 word count: 1882
|
| 972 |
+
Chunk 4 word count: 206
|
| 973 |
+
Chunk 5 word count: 1961
|
| 974 |
+
Chunk 6 word count: 206
|
| 975 |
+
Chunk 7 word count: 1843
|
| 976 |
+
Chunk 8 word count: 206
|
| 977 |
+
Chunk 9 word count: 1865
|
| 978 |
+
Chunk 10 word count: 206
|
| 979 |
+
Chunk 11 word count: 1812
|
| 980 |
+
Chunk 12 word count: 1994
|
| 981 |
+
Chunk 13 word count: 956
|
| 982 |
+
Chunk 14 word count: 206
|
| 983 |
+
Chunk 15 word count: 1948
|
| 984 |
+
Chunk 16 word count: 206
|
| 985 |
+
|
| 986 |
+
Batch 16:
|
| 987 |
+
Chunk 1 word count: 1854
|
| 988 |
+
Chunk 2 word count: 1789
|
| 989 |
+
Chunk 3 word count: 1996
|
| 990 |
+
Chunk 4 word count: 744
|
| 991 |
+
Chunk 5 word count: 32
|
| 992 |
+
Chunk 6 word count: 1991
|
| 993 |
+
Chunk 7 word count: 206
|
| 994 |
+
Chunk 8 word count: 2000
|
| 995 |
+
Chunk 9 word count: 1
|
| 996 |
+
Chunk 10 word count: 206
|
| 997 |
+
Chunk 11 word count: 1992
|
| 998 |
+
Chunk 12 word count: 206
|
| 999 |
+
Chunk 13 word count: 1970
|
| 1000 |
+
Chunk 14 word count: 1934
|
| 1001 |
+
Chunk 15 word count: 1962
|
| 1002 |
+
Chunk 16 word count: 206
|
| 1003 |
+
|
| 1004 |
+
Batch 17:
|
| 1005 |
+
Chunk 1 word count: 1956
|
| 1006 |
+
Chunk 2 word count: 206
|
| 1007 |
+
Chunk 3 word count: 1842
|
| 1008 |
+
Chunk 4 word count: 206
|
| 1009 |
+
Chunk 5 word count: 1916
|
| 1010 |
+
Chunk 6 word count: 206
|
| 1011 |
+
Chunk 7 word count: 1896
|
| 1012 |
+
Chunk 8 word count: 206
|
| 1013 |
+
Chunk 9 word count: 1942
|
| 1014 |
+
Chunk 10 word count: 1998
|
| 1015 |
+
Chunk 11 word count: 206
|
| 1016 |
+
Chunk 12 word count: 1802
|
| 1017 |
+
Chunk 13 word count: 1959
|
| 1018 |
+
Chunk 14 word count: 206
|
| 1019 |
+
Chunk 15 word count: 1795
|
| 1020 |
+
Chunk 16 word count: 206
|
| 1021 |
+
|
| 1022 |
+
Batch 18:
|
| 1023 |
+
Chunk 1 word count: 1909
|
| 1024 |
+
Chunk 2 word count: 206
|
| 1025 |
+
Chunk 3 word count: 1962
|
| 1026 |
+
Chunk 4 word count: 1212
|
| 1027 |
+
Chunk 5 word count: 1713
|
| 1028 |
+
Chunk 6 word count: 206
|
| 1029 |
+
Chunk 7 word count: 2000
|
| 1030 |
+
Chunk 8 word count: 3
|
| 1031 |
+
Chunk 9 word count: 206
|
| 1032 |
+
Chunk 10 word count: 1957
|
| 1033 |
+
Chunk 11 word count: 610
|
| 1034 |
+
Chunk 12 word count: 158
|
| 1035 |
+
|
| 1036 |
+
Summary:
|
| 1037 |
+
Chunks split by level 1 headings (#): 70
|
| 1038 |
+
Chunks split by grouped level 2 headings (##): 146
|
| 1039 |
+
Chunks split by paragraphs: 9
|
| 1040 |
+
Batch 1 embeddings received, total embeddings so far: 16
|
| 1041 |
+
Batch 2 embeddings received, total embeddings so far: 32
|
| 1042 |
+
Batch 3 embeddings received, total embeddings so far: 48
|
| 1043 |
+
Batch 4 embeddings received, total embeddings so far: 64
|
| 1044 |
+
Batch 5 embeddings received, total embeddings so far: 80
|
| 1045 |
+
Batch 6 embeddings received, total embeddings so far: 96
|
| 1046 |
+
Batch 7 embeddings received, total embeddings so far: 112
|
| 1047 |
+
Batch 8 embeddings received, total embeddings so far: 128
|
| 1048 |
+
Batch 9 embeddings received, total embeddings so far: 144
|
| 1049 |
+
Batch 10 embeddings received, total embeddings so far: 160
|
| 1050 |
+
Batch 11 embeddings received, total embeddings so far: 176
|
| 1051 |
+
Batch 12 embeddings received, total embeddings so far: 192
|
| 1052 |
+
Batch 13 embeddings received, total embeddings so far: 208
|
| 1053 |
+
Batch 14 embeddings received, total embeddings so far: 224
|
| 1054 |
+
Batch 15 embeddings received, total embeddings so far: 240
|
| 1055 |
+
Batch 16 embeddings received, total embeddings so far: 256
|
| 1056 |
+
Batch 17 embeddings received, total embeddings so far: 272
|
| 1057 |
+
Batch 18 embeddings received, total embeddings so far: 284
|
| 1058 |
+
Upserted points 0 to 19
|
| 1059 |
+
Upserted points 20 to 39
|
| 1060 |
+
Upserted points 40 to 59
|
| 1061 |
+
Upserted points 60 to 79
|
| 1062 |
+
Upserted points 80 to 99
|
| 1063 |
+
Upserted points 100 to 119
|
| 1064 |
+
Upserted points 120 to 139
|
| 1065 |
+
Upserted points 140 to 159
|
| 1066 |
+
Upserted points 160 to 179
|
| 1067 |
+
Upserted points 180 to 199
|
| 1068 |
+
Upserted points 200 to 219
|
| 1069 |
+
Upserted points 220 to 239
|
| 1070 |
+
Upserted points 240 to 259
|
| 1071 |
+
Upserted points 260 to 279
|
| 1072 |
+
Upserted points 280 to 283
|
| 1073 |
+
Total upserted points: 284
|
| 1074 |
+
processing complete
|
| 1075 |
+
INFO: 10.16.32.117:37373 - "POST /api/v1/hackrx/run HTTP/1.1" 200 OK
|
| 1076 |
+
ERROR: Exception in ASGI application
|
| 1077 |
+
Traceback (most recent call last):
|
| 1078 |
+
File "/home/user/.local/lib/python3.9/site-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
|
| 1079 |
+
result = await app( # type: ignore[func-returns-value]
|
| 1080 |
+
File "/home/user/.local/lib/python3.9/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
|
| 1081 |
+
return await self.app(scope, receive, send)
|
| 1082 |
+
File "/home/user/.local/lib/python3.9/site-packages/fastapi/applications.py", line 1054, in __call__
|
| 1083 |
+
await super().__call__(scope, receive, send)
|
| 1084 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/applications.py", line 113, in __call__
|
| 1085 |
+
await self.middleware_stack(scope, receive, send)
|
| 1086 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/middleware/errors.py", line 186, in __call__
|
| 1087 |
+
raise exc
|
| 1088 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/middleware/errors.py", line 164, in __call__
|
| 1089 |
+
await self.app(scope, receive, _send)
|
| 1090 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/middleware/cors.py", line 85, in __call__
|
| 1091 |
+
await self.app(scope, receive, send)
|
| 1092 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
|
| 1093 |
+
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
|
| 1094 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
|
| 1095 |
+
raise exc
|
| 1096 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
|
| 1097 |
+
await app(scope, receive, sender)
|
| 1098 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/routing.py", line 716, in __call__
|
| 1099 |
+
await self.middleware_stack(scope, receive, send)
|
| 1100 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/routing.py", line 736, in app
|
| 1101 |
+
await route.handle(scope, receive, send)
|
| 1102 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/routing.py", line 290, in handle
|
| 1103 |
+
await self.app(scope, receive, send)
|
| 1104 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/routing.py", line 78, in app
|
| 1105 |
+
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
|
| 1106 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
|
| 1107 |
+
raise exc
|
| 1108 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
|
| 1109 |
+
await app(scope, receive, sender)
|
| 1110 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/routing.py", line 76, in app
|
| 1111 |
+
await response(scope, receive, send)
|
| 1112 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/responses.py", line 168, in __call__
|
| 1113 |
+
await self.background()
|
| 1114 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/background.py", line 42, in __call__
|
| 1115 |
+
await task()
|
| 1116 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/background.py", line 29, in __call__
|
| 1117 |
+
await run_in_threadpool(self.func, *self.args, **self.kwargs)
|
| 1118 |
+
File "/home/user/.local/lib/python3.9/site-packages/starlette/concurrency.py", line 38, in run_in_threadpool
|
| 1119 |
+
return await anyio.to_thread.run_sync(func)
|
| 1120 |
+
File "/home/user/.local/lib/python3.9/site-packages/anyio/to_thread.py", line 56, in run_sync
|
| 1121 |
+
return await get_async_backend().run_sync_in_worker_thread(
|
| 1122 |
+
File "/home/user/.local/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 2470, in run_sync_in_worker_thread
|
| 1123 |
+
return await future
|
| 1124 |
+
File "/home/user/.local/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 967, in run
|
| 1125 |
+
result = context.run(func, *args)
|
| 1126 |
+
File "/app/qdrant_setup.py", line 83, in clear_collection_payloads
|
| 1127 |
+
client.create_collection(
|
| 1128 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/qdrant_client.py", line 2382, in create_collection
|
| 1129 |
+
return self._client.create_collection(
|
| 1130 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/qdrant_remote.py", line 2815, in create_collection
|
| 1131 |
+
result: Optional[bool] = self.http.collections_api.create_collection(
|
| 1132 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/http/api/collections_api.py", line 294, in create_collection
|
| 1133 |
+
return self._build_for_create_collection(
|
| 1134 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/http/api/collections_api.py", line 96, in _build_for_create_collection
|
| 1135 |
+
return self.api_client.request(
|
| 1136 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/http/api_client.py", line 95, in request
|
| 1137 |
+
return self.send(request, type_)
|
| 1138 |
+
File "/home/user/.local/lib/python3.9/site-packages/qdrant_client/http/api_client.py", line 130, in send
|
| 1139 |
+
raise UnexpectedResponse.for_response(response)
|
| 1140 |
+
qdrant_client.http.exceptions.UnexpectedResponse: Unexpected Response: 409 (Conflict)
|
| 1141 |
+
Raw response content:
|
| 1142 |
+
b'{"status":{"error":"Wrong input: Collection `test` already exists!"},"time":0.128498264}'
|
| 1143 |
+
|
| 1144 |
+
Want to edit your Space's metadata? Head to the README.md and metadata UI instead.
|
| 1145 |
+
Space Hardware
|
| 1146 |
+
Display price:
|
| 1147 |
+
|
| 1148 |
+
|
| 1149 |
+
per hour
|
| 1150 |
+
per month
|
| 1151 |
+
Choose a hardware for your Space.
|
| 1152 |
+
|
| 1153 |
+
You'll be billed on a per minute basis.
|
| 1154 |
+
View usage in your billing settings.
|
| 1155 |
+
|
| 1156 |
+
Sleep time settings
|
| 1157 |
+
Sleep after
|
| 1158 |
+
48 hours
|
| 1159 |
+
of inactivity
|
| 1160 |
+
Upgrade to a paid Hardware to set a custom sleep time.
|
| 1161 |
+
|
| 1162 |
+
Pause Space
|
| 1163 |
+
|
| 1164 |
+
Building something cool as a side project?
|
| 1165 |
+
Apply for a community GPU grant.
|
| 1166 |
+
|
| 1167 |
+
|
| 1168 |
+
CPU basic
|
| 1169 |
+
2 vCPU
|
| 1170 |
+
·
|
| 1171 |
+
16 GB RAM
|
| 1172 |
+
Current · Free
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
CPU upgrade
|
| 1176 |
+
8 vCPU
|
| 1177 |
+
·
|
| 1178 |
+
32 GB RAM
|
| 1179 |
+
$0.03/hour
|
| 1180 |
+
|
| 1181 |
+
ZeroGPU
|
| 1182 |
+
Dynamic resources
|
| 1183 |
+
·
|
| 1184 |
+
Gradio only
|
| 1185 |
+
Free
|
| 1186 |
+
|
| 1187 |
+
Nvidia T4 small
|
| 1188 |
+
4 vCPU
|
| 1189 |
+
·
|
| 1190 |
+
15 GB RAM
|
| 1191 |
+
·
|
| 1192 |
+
16 GB VRAM
|
| 1193 |
+
$0.40/hour
|
| 1194 |
+
|
| 1195 |
+
Nvidia T4 medium
|
| 1196 |
+
8 vCPU
|
| 1197 |
+
·
|
| 1198 |
+
30 GB RAM
|
| 1199 |
+
·
|
| 1200 |
+
16 GB VRAM
|
| 1201 |
+
$0.60/hour
|
| 1202 |
+
|
| 1203 |
+
Nvidia 1xL4
|
| 1204 |
+
8 vCPU
|
| 1205 |
+
·
|
| 1206 |
+
30 GB RAM
|
| 1207 |
+
·
|
| 1208 |
+
24 GB VRAM
|
| 1209 |
+
$0.80/hour
|
| 1210 |
+
|
| 1211 |
+
Nvidia 4xL4
|
| 1212 |
+
48 vCPU
|
| 1213 |
+
·
|
| 1214 |
+
186 GB RAM
|
| 1215 |
+
·
|
| 1216 |
+
96 GB VRAM
|
| 1217 |
+
$3.80/hour
|
| 1218 |
+
|
| 1219 |
+
Nvidia 1xL40S
|
| 1220 |
+
8 vCPU
|
| 1221 |
+
·
|
| 1222 |
+
62 GB RAM
|
| 1223 |
+
·
|
| 1224 |
+
48 GB VRAM
|
| 1225 |
+
$1.80/hour
|
| 1226 |
+
|
| 1227 |
+
Nvidia 4xL40S
|
| 1228 |
+
48 vCPU
|
| 1229 |
+
·
|
| 1230 |
+
382 GB RAM
|
| 1231 |
+
·
|
| 1232 |
+
192 GB VRAM
|
| 1233 |
+
$8.30/hour
|
| 1234 |
+
|
| 1235 |
+
Nvidia 8xL40S
|
| 1236 |
+
192 vCPU
|
| 1237 |
+
·
|
| 1238 |
+
1534 GB RAM
|
| 1239 |
+
·
|
| 1240 |
+
384 GB VRAM
|
| 1241 |
+
$23.50/hour
|
| 1242 |
+
|
| 1243 |
+
Nvidia A10G small
|
| 1244 |
+
4 vCPU
|
| 1245 |
+
·
|
| 1246 |
+
15 GB RAM
|
| 1247 |
+
·
|
| 1248 |
+
24 GB VRAM
|
| 1249 |
+
$1.00/hour
|
tokenizer.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Tuple
|
| 2 |
+
|
| 3 |
+
from tiktoken import get_encoding
|
| 4 |
+
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Create a wrapper class to make OpenAI's tokenizer compatible with the HybridChunker interface
|
| 8 |
+
class OpenAITokenizerWrapper(PreTrainedTokenizerBase):
    """Minimal wrapper exposing a tiktoken encoding via the HuggingFace tokenizer interface.

    Tokens are represented as the decimal string of their tiktoken id, so the
    strings returned by ``tokenize`` round-trip through ``int`` / ``str``.
    """

    def __init__(
        self, model_name: str = "cl100k_base", max_length: int = 8191, **kwargs
    ):
        """Initialize the tokenizer.

        Args:
            model_name: The name of the OpenAI (tiktoken) encoding to use.
            max_length: Maximum sequence length; forwarded to the base class
                as ``model_max_length``.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(model_max_length=max_length, **kwargs)
        self.tokenizer = get_encoding(model_name)
        # ``max_token_value`` is the largest token id, not the number of ids;
        # tiktoken defines the vocabulary size as ``max_token_value + 1``.
        self._vocab_size = self.tokenizer.max_token_value + 1

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """Main method used by HybridChunker.

        Args:
            text: The text to encode.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The token ids of ``text``, each rendered as a string.
        """
        return [str(t) for t in self.tokenizer.encode(text)]

    def _tokenize(self, text: str) -> List[str]:
        """Delegate to ``tokenize``."""
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a stringified token id back to its integer id."""
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an integer token id to its string token form."""
        return str(index)

    def get_vocab(self) -> Dict[str, int]:
        """Return a mapping from each string token to its integer id.

        Keys use the same stringified-id form that ``tokenize`` produces, so
        the mapping is consistent with ``_convert_token_to_id``.
        """
        return {str(token_id): token_id for token_id in range(self.vocab_size)}

    @property
    def vocab_size(self) -> int:
        """Number of token ids in the underlying encoding."""
        return self._vocab_size

    def save_vocabulary(self, *args, **kwargs) -> Tuple[str, ...]:
        """No-op: this wrapper writes no vocabulary files.

        Keyword arguments are accepted and ignored so that callers passing
        ``filename_prefix=...`` by keyword do not fail.

        Returns:
            An empty tuple (no files written).
        """
        return ()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Class method to match HuggingFace's interface.

        All arguments are ignored; a wrapper with default settings is returned.
        """
        return cls()
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|