File size: 1,060 Bytes
d119b72
ded7051
 
a9ae09a
 
 
 
ded7051
 
a9ae09a
ded7051
a9ae09a
 
 
 
 
 
 
 
ded7051
 
a9ae09a
a6c97c1
 
 
 
 
 
 
 
 
 
 
d119b72
 
 
 
 
bc2c75b
 
 
 
 
d119b72
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
from pathlib import Path

from dotenv import load_dotenv

# Read key/value pairs from a .env file (when one is found) into the process
# environment before anything below runs.
load_dotenv()


class Paths:
    """Namespace for the data directories used by the project.

    Every path is relative, so it resolves against the current working
    directory of the process.
    """

    # Root data directory.
    DATA: Path = Path("data")
    # Subdirectories of DATA.
    ADR: Path = DATA.joinpath("adr")
    UKDS: Path = DATA.joinpath("ukds")
    CDRC: Path = DATA.joinpath("cdrc")

    @classmethod
    def ensure_directories_exist(cls) -> None:
        """Create the ADR, UKDS and CDRC directories, including missing parents.

        Directories that already exist are left untouched, so the call can be
        repeated safely.
        """
        for directory in (cls.ADR, cls.UKDS, cls.CDRC):
            directory.mkdir(parents=True, exist_ok=True)


# Runs at import time: importing this module creates the data directories
# (relative to the current working directory) if they are missing.
Paths.ensure_directories_exist()


def pretty_print_docs(docs):
    """Print each document's content and metadata, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``
            attributes. Documents are numbered from 1 in iteration order.
    """
    # A line of 100 dashes on its own line between consecutive documents.
    divider = f"\n{'-' * 100}\n"

    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(
            f"Document {number}:\n\n{doc.page_content}\nMetadata: {doc.metadata}"
        )

    print(divider.join(sections))


def clean_string(text: str) -> str:
    """Strip comments and tags from *text* and normalise its whitespace.

    Processing steps, in order:

    1. Remove ``/* ... */`` block comments (these may span lines).
    2. Remove ``<...>`` tag-like spans that open and close on one line.
    3. Collapse runs of spaces and tabs into a single space.
    4. Treat any whitespace run containing two or more newlines as a
       paragraph break and emit it as exactly ``"\\n\\n"``.
    5. Inside a paragraph, replace a newline (and the whitespace around it)
       with a single space.
    6. Strip leading and trailing whitespace from the result.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, with paragraphs separated by one blank line.
    """
    # Block comments may contain newlines, hence DOTALL.
    text = re.sub(r"/\*.*?\*/", "", text, flags=re.DOTALL)
    # No DOTALL here: "." stops at a newline, so only single-line tags match.
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[ \t]+", " ", text)

    # Split on blank-line runs, absorbing the whitespace on both sides so no
    # stray space or "\r" is left dangling next to a paragraph break.
    paragraphs = re.split(r"\s*\n\s*\n\s*", text)

    # The newlines left inside a paragraph are soft line wraps.
    paragraphs = [re.sub(r"\s*\n\s*", " ", part) for part in paragraphs]

    return "\n\n".join(paragraphs).strip()