import os

import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances

# Download the NLTK sentence-tokenizer data
# (NLTK 3.9+ also needs 'punkt_tab'; the extra download is harmless on older versions)
nltk.download('punkt')
nltk.download('punkt_tab')


# Chunk text by clustering sentence embeddings: sentences whose pairwise
# cosine distance stays below the threshold end up in the same chunk.
# Note that the clustering ignores sentence position, so a chunk may
# combine sentences that are not adjacent in the source text.
def semantic_chunking(text, model, threshold=0.5):
    sentences = sent_tokenize(text)
    # Clustering needs at least two samples; treat short texts as one chunk
    if len(sentences) < 2:
        return [text.strip()] if text.strip() else []
    embeddings = model.encode(sentences)
    # Full pairwise cosine-distance matrix (diagonal is zero)
    distances = cosine_distances(embeddings)
    # Merge clusters until the average linkage distance exceeds the threshold
    # (the `metric` keyword requires scikit-learn >= 1.2; older versions call it `affinity`)
    clustering = AgglomerativeClustering(
        n_clusters=None,
        metric='precomputed',
        linkage='average',
        distance_threshold=threshold,
    )
    clustering.fit(distances)
    # Group sentences by cluster label, then join each group into one chunk
    chunks = [[] for _ in range(clustering.n_clusters_)]
    for sentence, label in zip(sentences, clustering.labels_):
        chunks[label].append(sentence)
    return [' '.join(chunk) for chunk in chunks]


# Initialize the sentence-transformer embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Chunk every .txt file in input_dir and write the chunks, separated by
# blank lines, to a file of the same name in output_dir.
def process_directory(input_dir, output_dir):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if not filename.endswith('.txt'):
            continue
        input_file_path = os.path.join(input_dir, filename)
        output_file_path = os.path.join(output_dir, filename)
        with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        try:
            chunks = semantic_chunking(text, model)
        except Exception as exc:
            print(f'Could not chunk {filename}: {exc}')
            continue
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for chunk in chunks:
                output_file.write(chunk + '\n\n')


# Specify the input and output directories
input_dir = 'docs_dump'
output_dir = 'semchunksBIG'

# Process the directory
process_directory(input_dir, output_dir)
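
# --- Usage sketch ---
# A minimal, illustrative call to semantic_chunking on an in-memory string;
# the sample text below is made up and not part of the original pipeline.
# With the default threshold of 0.5 the two topically related sentence pairs
# tend to land in separate chunks, though the exact grouping depends on the
# embedding model and the threshold chosen.
sample = (
    'Cats are small domesticated carnivores. '
    'They communicate by purring and meowing. '
    'The stock market fell sharply on Tuesday. '
    'Investors reacted to the latest inflation data.'
)
for i, chunk in enumerate(semantic_chunking(sample, model)):
    print(f'Chunk {i}: {chunk}')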