vivianoh committed on
Commit
c6f5f0c
·
verified ·
1 Parent(s): 9d35137

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -4
app.py CHANGED
@@ -13,15 +13,31 @@ with open("Untitled document.txt", "r", encoding="utf-8") as f:
13
 
14
  # Step 2: Preprocess text into sentence chunks
15
  def preprocess_text(text):
16
-
17
  cleaned_text = text.strip()
18
- chunks = cleaned_text.split(".")
19
- combined_chunks = []
20
 
 
21
  for i in range(0, len(sentences), 2):
22
  chunk = '. '.join(sentences[i:i+3]).strip()
23
-
 
 
24
  paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  cleaned_chunks = preprocess_text(skincare_text)
27
 
 
13
 
14
  # Step 2: Preprocess text into sentence chunks
15
  def preprocess_text(text):
 
16
  cleaned_text = text.strip()
17
+ sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]
18
+ sentence_chunks = [s.strip() for s in sentences if len(s.strip()) > 10]
19
 
20
+ combined_chunks = []
21
  for i in range(0, len(sentences), 2):
22
  chunk = '. '.join(sentences[i:i+3]).strip()
23
+ if len(chunk) > 20:
24
+ combined_chunks.append(chunk)
25
+
26
  paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
27
+ paragraph_chunks = [p for p in paragraphs if len(p) > 30]
28
+
29
+ all_chunks = sentence_chunks + combined_chunks + paragraph_chunks
30
+
31
+ seen = set()
32
+ final_chunks = []
33
+ for chunk in all_chunks:
34
+ if chunk not in seen and len(chunk) > 15:
35
+ seen.add(chunk)
36
+ final_chunks.append(chunk)
37
+
38
+ print(f"Created {len(final_chunks)} chunks using advanced strategy")
39
+ print(f"Sample chunks: {final_chunks[:3]}")
40
+ return final_chunks
41
 
42
  # Chunk the document text once at import time; `skincare_text` is
  # presumably the contents read from the text file opened above — confirm.
  cleaned_chunks = preprocess_text(skincare_text)
43