Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
- app.py +41 -41
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -49,45 +49,45 @@ with open(data_path, "r") as f:
|
|
| 49 |
# Pre-compute corpus embeddings
|
| 50 |
import re
|
| 51 |
|
| 52 |
-
def split_into_sentences(text):
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
def process_documents_for_chunking(documents):
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
|
| 92 |
# Pre-compute corpus embeddings
|
| 93 |
original_corpus = [item["positive"] for item in dataset]
|
|
@@ -147,8 +147,8 @@ def find_similar(prompt, top_k):
|
|
| 147 |
end_time = time.time()
|
| 148 |
|
| 149 |
results = []
|
| 150 |
-
|
| 151 |
-
for doc, score in doc_score_pairs:
|
| 152 |
results.append((score, doc))
|
| 153 |
|
| 154 |
return results, f"{(end_time - start_time) * 1000:.2f} ms"
|
|
|
|
| 49 |
# Pre-compute corpus embeddings
|
| 50 |
import re
|
| 51 |
|
| 52 |
+
# def split_into_sentences(text):
|
| 53 |
+
# """Splits a paragraph into sentences based on capitalization and punctuation."""
|
| 54 |
+
# # This regex looks for a capital letter, followed by anything that's not a period,
|
| 55 |
+
# # exclamation mark, or question mark, and then ends with one of those punctuation marks.
|
| 56 |
+
# sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
|
| 57 |
+
# return sentences
|
| 58 |
+
|
| 59 |
+
# def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
|
| 60 |
+
# chunked_corpus = []
|
| 61 |
+
# for doc_idx, doc_text in enumerate(corpus_documents):
|
| 62 |
+
# sentences = split_into_sentences(doc_text)
|
| 63 |
+
# if not sentences:
|
| 64 |
+
# continue
|
| 65 |
+
|
| 66 |
+
# # If there are fewer sentences than chunk_size, just use the whole document as one chunk
|
| 67 |
+
# if len(sentences) < chunk_size:
|
| 68 |
+
# chunked_corpus.append({
|
| 69 |
+
# "text": doc_text,
|
| 70 |
+
# "original_doc_idx": doc_idx,
|
| 71 |
+
# "start_sentence_idx": 0,
|
| 72 |
+
# "end_sentence_idx": len(sentences) - 1
|
| 73 |
+
# })
|
| 74 |
+
# continue
|
| 75 |
+
|
| 76 |
+
# for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
|
| 77 |
+
# chunk_sentences = sentences[i : i + chunk_size]
|
| 78 |
+
# chunk_text = " ".join(chunk_sentences)
|
| 79 |
+
# chunked_corpus.append({
|
| 80 |
+
# "text": chunk_text,
|
| 81 |
+
# "original_doc_idx": doc_idx,
|
| 82 |
+
# "start_sentence_idx": i,
|
| 83 |
+
# "end_sentence_idx": i + chunk_size - 1
|
| 84 |
+
# })
|
| 85 |
+
# return chunked_corpus
|
| 86 |
+
|
| 87 |
+
# def process_documents_for_chunking(documents):
|
| 88 |
+
# chunked_corpus_data = create_overlapped_chunks(documents)
|
| 89 |
+
# flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
|
| 90 |
+
# return chunked_corpus_data, flat_corpus_chunks
|
| 91 |
|
| 92 |
# Pre-compute corpus embeddings
|
| 93 |
original_corpus = [item["positive"] for item in dataset]
|
|
|
|
| 147 |
end_time = time.time()
|
| 148 |
|
| 149 |
results = []
|
| 150 |
+
for doc, score in doc_score_pairs[:top_k]:
|
| 151 |
+
# for doc, score in doc_score_pairs:
|
| 152 |
results.append((score, doc))
|
| 153 |
|
| 154 |
return results, f"{(end_time - start_time) * 1000:.2f} ms"
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
gradio
|
| 2 |
transformers
|
|
|
|
| 3 |
torch
|
| 4 |
huggingface_hub
|
|
|
|
| 1 |
gradio
|
| 2 |
transformers
|
| 3 |
+
sentence_transformers
|
| 4 |
torch
|
| 5 |
huggingface_hub
|