Spaces:
Paused
Paused
Commit
·
cbd72a8
1
Parent(s):
2b2b23b
fix chunking
Browse files
app.py
CHANGED
|
@@ -1,14 +1,9 @@
|
|
| 1 |
import chromadb
|
| 2 |
import gradio
|
| 3 |
-
import nltk
|
| 4 |
import sentence_transformers
|
| 5 |
from chromadb.config import Settings
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
-
from nltk import sent_tokenize
|
| 9 |
-
|
| 10 |
-
nltk.download('punkt')
|
| 11 |
-
# The above code snippet was generated by ChatGPT 5.0 at 11:33 a 11/17/25
|
| 12 |
|
| 13 |
# Provide your own dataset here. Decide on a chunking strategy and implement here.
|
| 14 |
texts = [
|
|
@@ -48,11 +43,8 @@ texts = [
|
|
| 48 |
]
|
| 49 |
|
| 50 |
# Chunking by sentence
|
| 51 |
-
sentences = []
|
| 52 |
-
|
| 53 |
-
s = sent_tokenize(paragraph)
|
| 54 |
-
sentences.extend(s)
|
| 55 |
-
# The above code snippet was generated by ChatGPT 5.0 at 11:33 a 11/17/25
|
| 56 |
|
| 57 |
# Generate embeddings
|
| 58 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 1 |
import chromadb
|
| 2 |
import gradio
|
|
|
|
| 3 |
import sentence_transformers
|
| 4 |
from chromadb.config import Settings
|
| 5 |
import gradio as gr
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Provide your own dataset here. Decide on a chunking strategy and implement here.
|
| 9 |
texts = [
|
|
|
|
| 43 |
]
|
| 44 |
|
| 45 |
# Chunking by sentence
|
| 46 |
+
sentences = [sentence for paragraph in texts for sentence in paragraph.split(". ")]
|
| 47 |
+
# The above code snippet was generated by ChatGPT 5.0 at 3:31p 11/17/25
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# Generate embeddings
|
| 50 |
from sentence_transformers import SentenceTransformer
|