Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,15 +13,31 @@ with open("Untitled document.txt", "r", encoding="utf-8") as f:
|
|
| 13 |
|
| 14 |
# Step 2: Preprocess text into sentence chunks
|
| 15 |
def preprocess_text(text):
|
| 16 |
-
|
| 17 |
cleaned_text = text.strip()
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
|
|
|
| 21 |
for i in range(0, len(sentences), 2):
|
| 22 |
chunk = '. '.join(sentences[i:i+3]).strip()
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
cleaned_chunks = preprocess_text(skincare_text)
|
| 27 |
|
|
|
|
| 13 |
|
| 14 |
# Step 2: Preprocess text into sentence, multi-sentence, and paragraph chunks
|
| 15 |
def preprocess_text(
    text,
    *,
    min_sentence_len=10,
    min_combined_len=20,
    min_paragraph_len=30,
    min_chunk_len=15,
    window=3,
    stride=2,
):
    """Split *text* into de-duplicated chunks at three granularities.

    The chunks are produced, in this order, from:

    1. single sentences (text between ``'.'`` characters),
    2. overlapping windows of ``window`` consecutive sentences, advancing
       ``stride`` sentences at a time and re-joined with ``'. '``,
    3. paragraphs (text between blank lines, i.e. ``'\\n\\n'``).

    Sentence splitting is a plain ``str.split('.')``: decimals and
    abbreviations are split at their periods, ``?`` and ``!`` do not end a
    sentence, and a sentence may span a paragraph break.

    Args:
        text: The raw document text.
        min_sentence_len: A single sentence is kept only if it is longer
            than this many characters.
        min_combined_len: A multi-sentence window is kept only if it is
            longer than this many characters.
        min_paragraph_len: A paragraph is kept only if it is longer than
            this many characters.
        min_chunk_len: Final floor applied to every chunk. With the
            defaults this makes the effective single-sentence minimum
            15 characters, not 10.
        window: Number of sentences joined into each combined chunk.
        stride: Number of sentences to advance between windows; a stride
            smaller than ``window`` makes consecutive windows overlap.

    Returns:
        A list of unique chunk strings in first-seen order.

    Raises:
        ValueError: If ``window`` or ``stride`` is less than 1.
    """
    if window < 1 or stride < 1:
        raise ValueError("window and stride must be positive integers")

    cleaned_text = text.strip()

    # Every entry is already stripped and non-empty, so later steps do not
    # need to strip again.
    sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]
    sentence_chunks = [s for s in sentences if len(s) > min_sentence_len]

    # Sliding windows over the *unfiltered* sentence list, so short
    # sentences still contribute context to their neighbours.
    windows = (
        '. '.join(sentences[start:start + window])
        for start in range(0, len(sentences), stride)
    )
    combined_chunks = [c for c in windows if len(c) > min_combined_len]

    paragraphs = (p.strip() for p in cleaned_text.split('\n\n'))
    paragraph_chunks = [
        p for p in paragraphs if p and len(p) > min_paragraph_len
    ]

    all_chunks = sentence_chunks + combined_chunks + paragraph_chunks

    # dict.fromkeys drops duplicates while preserving first-seen order.
    final_chunks = list(
        dict.fromkeys(c for c in all_chunks if len(c) > min_chunk_len)
    )

    print(f"Created {len(final_chunks)} chunks using advanced strategy")
    print(f"Sample chunks: {final_chunks[:3]}")
    return final_chunks
|
| 41 |
|
| 42 |
# Build the chunk list from the loaded text. `skincare_text` is defined
# outside this view (presumably the contents of the text file read earlier
# in app.py — confirm).
cleaned_chunks = preprocess_text(skincare_text)
|
| 43 |
|