## GENERATES RAG DOCS FROM RAG-CORPUS
import os
import re

from transformers import AutoTokenizer

# The tokenizer is used only to count tokens per chunk; it should match the
# model that will consume the chunks downstream.
tokenizer = AutoTokenizer.from_pretrained("rasyosef/phi-2-instruct-v0.1")

def split_into_chunks(text, max_tokens=80):
    """Greedily pack whole sentences into chunks of at most max_tokens tokens."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        tentative = current_chunk + " " + sentence if current_chunk else sentence
        tokenized = tokenizer(tentative, truncation=False, return_tensors="np")
        token_count = len(tokenized["input_ids"][0])
        if token_count <= max_tokens:
            current_chunk = tentative
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start a fresh chunk; a single sentence longer than max_tokens is
            # kept whole rather than split further.
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
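
# Illustrative example (an addition, not part of the original script): with a
# small budget the third sentence overflows the running chunk and starts a new
# one. The exact grouping depends on the phi-2 tokenizer's token counts, so
# the output shown is approximate:
#     split_into_chunks("Cats sleep a lot. Dogs bark loudly. Birds sing.", max_tokens=10)
#     -> ['Cats sleep a lot. Dogs bark loudly.', 'Birds sing.']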

def process_all_files(root_dir="rag-corpus", output_file="rag-corpus/rag_docs.txt"):
    """Chunk every .txt file under root_dir and write all chunks to output_file."""
    all_chunks = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            # Skip the output file itself so a re-run does not re-ingest
            # previously generated chunks (it lives inside root_dir).
            if file.endswith(".txt") and file != os.path.basename(output_file):
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read()
                chunks = split_into_chunks(text, max_tokens=50)
                all_chunks.extend(chunks)
    # Join chunks with a "---" separator line so they can be split apart later.
    with open(output_file, "w", encoding="utf-8") as out:
        out.write("\n---\n".join(all_chunks))

if __name__ == "__main__":
    process_all_files()
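
Downstream, the generated file can be split back into its individual chunks on the "---" separator lines. A minimal consumer sketch, assuming the script above has already produced rag-corpus/rag_docs.txt:

    # Load the chunks written by process_all_files() back into a list.
    with open("rag-corpus/rag_docs.txt", "r", encoding="utf-8") as f:
        docs = [c.strip() for c in f.read().split("\n---\n") if c.strip()]
    print(f"Loaded {len(docs)} chunks")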