kolkata97
/

pe-llm-0

Text Classification

text-embeddings-inference

Model card Files Files and versions

kolkata97 committed on Sep 19, 2023

Commit

9d49094

·

1 Parent(s): 36b3d89

Update textsegmentation.py

Files changed (1) hide show

textsegmentation.py +4 -17

textsegmentation.py CHANGED Viewed

@@ -3,29 +3,16 @@ def textsegmentation():
     with open(contract_file_path, 'r') as file:
         contract_text = file.read()
-    # Split the contract text into paragraphs
-    paragraphs = re.split(r'\n\s*\n', contract_text)
-    # Remove leading and trailing whitespace from each paragraph
-    paragraphs = [paragraph.strip() for paragraph in paragraphs]
-    # Remove line breaks within each paragraph
-    paragraphs = [re.sub(r'\s+', ' ', paragraph) for paragraph in paragraphs]
-    new_sentences.append(paragraphs)
-    # Print the extracted clauses
-    for i, clause in enumerate(paragraphs):
-        print(f"Segment {i+1}: {clause}\n")
     # Prepare data for CSV
-    #assign to data only the clause
-    data = [(i+1, paragraph) for i, paragraph in enumerate(paragraphs)]
     # Write the data to CSV file
     with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
         writer = csv.writer(file)
-        writer.writerow(['Segment ID', 'Segment Text'])  # Write header
         writer.writerows(data)
     print("Output saved to CSV file.")

     with open(contract_file_path, 'r') as file:
         contract_text = file.read()
+    # Tokenize the contract text into sentences
+    sentences = nltk.sent_tokenize(contract_text)
     # Prepare data for CSV
+    data = [(i+1, sentence) for i, sentence in enumerate(sentences)]
     # Write the data to CSV file
     with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
         writer = csv.writer(file)
+        writer.writerow(['Sentence ID', 'Sentence Text'])  # Write header
         writer.writerows(data)
     print("Output saved to CSV file.")