Spaces:

EgorShibaev
/

GenerativeAI-RAG-project

Runtime error

App Files Files Community

EgorShibaev committed on Feb 18, 2024

Commit

d8b31a6

1 Parent(s): e71def3

scripts

Browse files

Files changed (4) hide show

prep_scripts/chunking.py +58 -0
prep_scripts/lancedb_setup.py +96 -0
prep_scripts/markdown_to_text.py +62 -0
prep_scripts/requirements.txt +9 -0

prep_scripts/chunking.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
+import argparse
+from pathlib import Path
+import os
+from tqdm import tqdm
+def fixed_size_chunking(text, chunk_size=256) -> list[str]:
+    splitter = CharacterTextSplitter(
+        separator=" ",
+        chunk_size=chunk_size,
+        chunk_overlap=20
+    )
+    return splitter.split_text(text)
+def content_aware_chunking(text, chunk_size=256) -> list[str]:
+    splitter = NLTKTextSplitter(
+        separator=".",
+        chunk_size = chunk_size,
+        chunk_overlap  = 20
+    )
+    return splitter.split_text(text)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-dir", help="input directory with text files", type=str,
+                        default="docs")
+    parser.add_argument("--output-dir", help="output directory to store chunked texts", type=str,
+                        default="chunked_docs")
+    parser.add_argument("--chunk-size", help="chunk size", type=int, default=256)
+    parser.add_argument("--chunking-type", help="fixed_size or content_aware", type=str, default="fixed_size")
+    args = parser.parse_args()
+    input_dir = Path(args.input_dir)
+    output_dir = Path(args.output_dir)
+    assert os.path.isdir(input_dir), "Input directory doesn't exist"
+    os.makedirs(output_dir, exist_ok=True)
+    for file in tqdm(input_dir.rglob("*")):
+        if file.is_file():
+            with open(file, 'r', encoding='utf8') as f:
+                text = f.read()
+            if args.chunking_type == "fixed_size":
+                chunked_text = fixed_size_chunking(text, args.chunk_size)
+            elif args.chunking_type == "content_aware":
+                chunked_text = content_aware_chunking(text, args.chunk_size)
+            else:
+                raise ValueError("Invalid chunking type. Choose from 'fixed_size' or 'content_aware'")
+            for i, chunk in enumerate(chunked_text):
+                with open(output_dir / f"{file.stem}_chunk_{i}.txt", "w", encoding='utf8') as f:
+                    f.write(chunk)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

prep_scripts/lancedb_setup.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import argparse
+import lancedb
+import torch
+import pyarrow as pa
+import pandas as pd
+from pathlib import Path
+import tqdm
+import numpy as np
+import logging
+from transformers import AutoConfig
+from sentence_transformers import SentenceTransformer
+logging.basicConfig(level=logging.INFO)  # root logger configured at INFO level
+logger = logging.getLogger(__name__)  # module-level logger used by main()
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--emb-model", help="embedding model name on HF hub", type=str)
+    parser.add_argument("--table", help="table name in DB", type=str)
+    parser.add_argument("--input-dir", help="input directory with documents to ingest", type=str)
+    parser.add_argument("--vec-column", help="vector column name in the table", type=str, default="vector")
+    parser.add_argument("--text-column", help="text column name in the table", type=str, default="text")
+    parser.add_argument("--db-loc", help="database location", type=str,
+                        default=str(Path().resolve() / "gradio_app" / ".lancedb"))
+    parser.add_argument("--batch-size", help="batch size for embedding model", type=int, default=32)
+    parser.add_argument("--num-partitions", help="number of partitions for index", type=int, default=256)
+    parser.add_argument("--num-sub-vectors", help="number of sub-vectors for index", type=int, default=96)
+    args = parser.parse_args()
+    emb_config = AutoConfig.from_pretrained(args.emb_model)
+    emb_dimension = emb_config.hidden_size
+    assert emb_dimension % args.num_sub_vectors == 0, \
+        "Embedding size must be divisible by the num of sub vectors"
+    model = SentenceTransformer(args.emb_model)
+    model.eval()
+    if torch.backends.mps.is_available():
+        device = "mps"
+    elif torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    logger.info(f"using {str(device)} device")
+    db = lancedb.connect(args.db_loc)
+    schema = pa.schema(
+      [
+          pa.field(args.vec_column, pa.list_(pa.float32(), emb_dimension)),
+          pa.field(args.text_column, pa.string())
+      ]
+    )
+    tbl = db.create_table(args.table, schema=schema, mode="overwrite")
+    input_dir = Path(args.input_dir)
+    files = list(input_dir.rglob("*"))
+    sentences = []
+    for file in files:
+        with open(file, 'r', encoding='utf8') as f:
+            sentences.append(f.read())
+    for i in tqdm.tqdm(range(0, int(np.ceil(len(sentences) / args.batch_size)))):
+        try:
+            batch = [sent for sent in sentences[i * args.batch_size:(i + 1) * args.batch_size] if len(sent) > 0]
+            encoded = model.encode(batch, normalize_embeddings=True, device=device)
+            encoded = [list(vec) for vec in encoded]
+            df = pd.DataFrame({
+                args.vec_column: encoded,
+                args.text_column: batch
+            })
+            tbl.add(df)
+        except:
+            logger.info(f"batch {i} was skipped")
+    '''
+    create ivf-pd index https://lancedb.github.io/lancedb/ann_indexes/
+    with the size of the transformer docs, index is not really needed
+    but we'll do it for demonstrational purposes
+    '''
+    tbl.create_index(
+        num_partitions=args.num_partitions,
+        num_sub_vectors=args.num_sub_vectors,
+        vector_column_name=args.vec_column
+    )
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

prep_scripts/markdown_to_text.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import argparse
+import os
+import re
+from tqdm import tqdm
+from bs4 import BeautifulSoup
+from markdown import markdown
+from pathlib import Path
+def markdown_to_text(markdown_string):
+    """ Converts a markdown string to plaintext """
+    # md -> html -> text since BeautifulSoup can extract text cleanly
+    html = markdown(markdown_string)
+    html = re.sub(r'<!--((.|\n)*)-->', '', html)
+    html = re.sub('<code>bash', '<code>', html)
+    # extract text
+    soup = BeautifulSoup(html, "html.parser")
+    text = ''.join(soup.findAll(text=True))
+    text = re.sub('```(py|diff|python)', '', text)
+    text = re.sub('```\n', '\n', text)
+    text = re.sub('-         .*', '', text)
+    text = text.replace('...', '')
+    text = re.sub('\n(\n)+', '\n\n', text)
+    return text
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-dir", help="input directory with markdown", type=str,
+                        default="transformers/docs/source/en/")
+    parser.add_argument("--output-dir", help="output directory to store raw texts", type=str,
+                        default="docs")
+    args = parser.parse_args()
+    input_dir = Path(args.input_dir)
+    output_dir = Path(args.output_dir)
+    assert os.path.isdir(input_dir), "Input directory doesn't exist"
+    files = input_dir.rglob("*")
+    os.makedirs(output_dir, exist_ok=True)
+    for file in tqdm(files):
+        parent = file.parent.stem if file.parent.stem != input_dir.stem else ""
+        if file.is_file():
+            with open(file, 'r', encoding='utf8') as f:
+                md = f.read()
+            text = markdown_to_text(md)
+            with open(output_dir / f"{parent}_{file.stem}.txt", "w", encoding='utf8') as f:
+                f.write(text)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

prep_scripts/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+bs4==0.0.1
+lancedb==0.5.3
+markdown==3.5.1
+numpy==1.26.2
+pandas==2.1.3
+pyarrow==14.0.1
+sentence-transformers==2.3.1
+tqdm==4.66.1
+torch==2.1.1