Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
#
|
| 2 |
# HACK AI MAKERSPACE PREPR
|
| 3 |
# Date: 2024-5-16
|
| 4 |
|
|
@@ -25,11 +24,12 @@ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
|
| 25 |
# Now load the document using the direct URL
|
| 26 |
docs = PyMuPDFLoader(direct_url).load()
|
| 27 |
|
| 28 |
-
import
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
)
|
|
|
|
| 33 |
return len(tokens)
|
| 34 |
|
| 35 |
# Split the document into chunks
|
|
@@ -38,7 +38,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
| 38 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 39 |
chunk_size = 500, # 500 tokens per chunk, experiment with this value
|
| 40 |
chunk_overlap = 50, # 50 tokens overlap between chunks, experiment with this value
|
| 41 |
-
length_function =
|
| 42 |
)
|
| 43 |
|
| 44 |
split_chunks = text_splitter.split_documents(docs)
|
|
@@ -146,5 +146,3 @@ async def main(message: cl.Message):
|
|
| 146 |
|
| 147 |
msg = cl.Message(content=chainlit_answer)
|
| 148 |
await msg.send()
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
| 1 |
# HACK AI MAKERSPACE PREPR
|
| 2 |
# Date: 2024-5-16
|
| 3 |
|
|
|
|
| 24 |
# Now load the document using the direct URL
|
| 25 |
docs = PyMuPDFLoader(direct_url).load()
|
| 26 |
|
| 27 |
+
from transformers import AutoTokenizer
|
| 28 |
+
|
| 29 |
+
# Function to calculate token length using Hugging Face tokenizer
|
| 30 |
+
def hf_token_len(text):
    """Return the number of tokens in *text* under the SOLAR-10.7B tokenizer.

    Used as the ``length_function`` of the text splitter, so chunk sizes are
    measured in tokens rather than characters.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token-id list produced by ``tokenizer.encode(text)``.
        NOTE(review): ``encode`` is called with its defaults, so any special
        tokens the tokenizer adds by default are presumably included in the
        count -- confirm this is the intended measure.
    """
    # The splitter calls this function many times while chunking. Loading the
    # tokenizer on every call is wasted work, so it is loaded on first use and
    # memoised on the function object.
    tokenizer = getattr(hf_token_len, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
        hf_token_len._tokenizer = tokenizer
    return len(tokenizer.encode(text))
|
| 34 |
|
| 35 |
# Split the document into chunks
|
|
|
|
| 38 |
# Chunking configuration. Lengths are measured with hf_token_len, so both
# numbers below are token counts; experiment with these values.
_splitter_options = {
    "chunk_size": 500,  # 500 tokens per chunk
    "chunk_overlap": 50,  # 50 tokens overlap between chunks
    "length_function": hf_token_len,
}
text_splitter = RecursiveCharacterTextSplitter(**_splitter_options)

# Break the loaded documents into chunks using the splitter configured above.
split_chunks = text_splitter.split_documents(docs)
|
|
|
|
| 146 |
|
| 147 |
msg = cl.Message(content=chainlit_answer)
|
| 148 |
await msg.send()
|
|
|
|
|
|