Spaces:

MikeCraBash
/

hack6124

Runtime error

MikeCraBash committed on Jun 1, 2024

Commit

ecbe7ae

verified ·

1 Parent(s): 7184bb7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-#
 # HACK AI MAKERSPACE PREPR
 # Date: 2024-5-16
@@ -25,11 +24,12 @@ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
 # Now load the document using the direct URL
 docs = PyMuPDFLoader(direct_url).load()
-import tiktoken
-def tiktoken_len(text):
-    tokens = tiktoken.encoding_for_model("solar-10.7b").encode(
-        text,
-    )
     return len(tokens)
 # Split the document into chunks
@@ -38,7 +38,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size = 500,           # 500 tokens per chunk, experiment with this value
     chunk_overlap = 50,        # 50 tokens overlap between chunks, experiment with this value
-    length_function = tiktoken_len,
 )
 split_chunks = text_splitter.split_documents(docs)
@@ -146,5 +146,3 @@ async def main(message: cl.Message):
     msg = cl.Message(content=chainlit_answer)
     await msg.send()

 # HACK AI MAKERSPACE PREPR
 # Date: 2024-5-16
 # Now load the document using the direct URL
 docs = PyMuPDFLoader(direct_url).load()
+from transformers import AutoTokenizer
+# Function to calculate token length using Hugging Face tokenizer
+def hf_token_len(text):
+    tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
+    tokens = tokenizer.encode(text)
     return len(tokens)
 # Split the document into chunks
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size = 500,           # 500 tokens per chunk, experiment with this value
     chunk_overlap = 50,        # 50 tokens overlap between chunks, experiment with this value
+    length_function = hf_token_len,
 )
 split_chunks = text_splitter.split_documents(docs)
     msg = cl.Message(content=chainlit_answer)
     await msg.send()