File size: 1,545 Bytes
cddae53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
## Generates RAG docs (token-bounded, sentence-aligned text chunks) from the .txt files under rag-corpus

from transformers import AutoTokenizer
import os
import re

# Shared tokenizer, loaded once at import time; split_into_chunks uses it to
# count tokens when deciding where a chunk must end.
# NOTE(review): from_pretrained presumably fetches/caches the model files on
# first use — confirm behavior in offline environments.
tokenizer = AutoTokenizer.from_pretrained("rasyosef/phi-2-instruct-v0.1")

def split_into_chunks(text, max_tokens=80):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

    Sentence boundaries are taken to be any run of whitespace (spaces, tabs
    or newlines) that follows ``.``, ``!`` or ``?``. Sentences are packed
    greedily: each one is appended to the current chunk (joined by a single
    space) as long as the token count reported by the module-level
    ``tokenizer`` stays within *max_tokens*; otherwise the current chunk is
    closed and the sentence starts a new one.

    A single sentence that on its own exceeds *max_tokens* is not split
    further; it is emitted as its own (oversized) chunk.

    Args:
        text: The raw text to split.
        max_tokens: Upper bound on the tokenizer's token count per chunk.

    Returns:
        A list of whitespace-stripped chunk strings in document order. The
        list is empty when *text* is empty or contains only whitespace.
    """
    # Split on any whitespace, not just spaces, so sentences that end at a
    # line break are separated too. Stripping first keeps leading/trailing
    # whitespace from producing empty or whitespace-only chunks.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not sentence:
            # Empty fragments (e.g. from empty input) carry no content.
            continue

        tentative = f"{current_chunk} {sentence}" if current_chunk else sentence
        tokenized = tokenizer(tentative, truncation=False, return_tensors="np")
        token_count = len(tokenized["input_ids"][0])

        if token_count <= max_tokens:
            current_chunk = tentative
            continue

        # Adding this sentence would overflow: close the current chunk (if
        # any) and let the sentence start the next one.
        if current_chunk:
            chunks.append(current_chunk.strip())
        current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def process_all_files(root_dir="rag-corpus", output_file="rag-corpus/rag_docs.txt", max_tokens=50):
    """Chunk every ``.txt`` file under *root_dir* and write the chunks to *output_file*.

    The directory tree is walked recursively in sorted order so that the
    output is reproducible across runs and filesystems. Each file is read as
    UTF-8 and passed to ``split_into_chunks``; all resulting chunks are
    written to *output_file* separated by ``"\\n---\\n"``.

    Args:
        root_dir: Directory searched recursively for ``.txt`` files.
        output_file: Path of the file the joined chunks are written to. It
            is overwritten if it exists; its parent directory is created if
            missing.
        max_tokens: Per-chunk token limit forwarded to ``split_into_chunks``.

    Returns:
        None. The result is the file written at *output_file*.
    """
    # The default output path lives inside root_dir and ends in ".txt", so
    # without this guard a second run would ingest its own previous output
    # and duplicate the whole corpus.
    output_path = os.path.realpath(output_file)

    all_chunks = []

    for root, dirs, files in os.walk(root_dir):
        # os.walk honors in-place edits of ``dirs``; sorting fixes the
        # traversal order, which is otherwise filesystem-dependent.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(root, filename)
            if os.path.realpath(file_path) == output_path:
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            # Chunk after the file is closed; only the text is needed.
            all_chunks.extend(split_into_chunks(text, max_tokens=max_tokens))

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as out:
        out.write("\n---\n".join(all_chunks))

# Script entry point: build the chunk file using process_all_files' default
# corpus directory and output path.
if __name__ == "__main__":
    process_all_files()