Spaces:
Sleeping
Sleeping
File size: 1,060 Bytes
d119b72 ded7051 a9ae09a ded7051 a9ae09a ded7051 a9ae09a ded7051 a9ae09a a6c97c1 d119b72 bc2c75b d119b72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | import re
from pathlib import Path
from dotenv import load_dotenv
# Read key/value pairs from a .env file (when one is found) into the process
# environment before anything below runs.
load_dotenv()
class Paths:
    """Filesystem locations of the data directories used by this module.

    Every path is relative, so it resolves against the current working
    directory at the moment it is used.
    """

    DATA: Path = Path("data")
    ADR: Path = DATA / "adr"
    UKDS: Path = DATA / "ukds"
    CDRC: Path = DATA / "cdrc"

    @classmethod
    def ensure_directories_exist(cls) -> None:
        """Create the ADR, UKDS and CDRC directories if they are missing.

        Missing parents (including ``DATA``) are created as well, and
        directories that already exist are left untouched.
        """
        for directory in (cls.ADR, cls.UKDS, cls.CDRC):
            directory.mkdir(parents=True, exist_ok=True)
# Module-level call: runs as soon as this module is executed or imported and
# creates the ADR, UKDS and CDRC directories if they do not exist yet.
Paths.ensure_directories_exist()
def pretty_print_docs(docs):
    """Print each document in ``docs`` to stdout, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``
            attributes. Documents are numbered from 1 in iteration order.
    """
    # A line of 100 dashes on its own line between consecutive documents.
    divider = f"\n{'-' * 100}\n"

    rendered = []
    for number, doc in enumerate(docs, start=1):
        rendered.append(
            f"Document {number}:\n\n{doc.page_content}\nMetadata: {doc.metadata}"
        )

    print(divider.join(rendered))
def _collapse_newline_run(match: "re.Match[str]") -> str:
    """Map a whitespace run containing newlines to a paragraph break or a space."""
    # Two or more newlines in the run mean at least one blank line: keep that
    # as a paragraph break. A lone newline is just a soft line wrap.
    return "\n\n" if match.group().count("\n") >= 2 else " "


def clean_string(text: str) -> str:
    """Strip markup from ``text`` and normalise its whitespace.

    Steps, in order:

    1. Remove ``/* ... */`` comments, including ones spanning several lines.
    2. Remove anything of the form ``<...>`` that sits on a single line.
       NOTE(review): this also removes non-tag text such as ``a < b > c``.
    3. Collapse runs of spaces and tabs into one space.
    4. Turn each whitespace run containing a blank line into exactly one
       paragraph break (``"\\n\\n"``) and each remaining newline, together
       with the whitespace around it, into a single space.
    5. Strip leading and trailing whitespace.

    Whitespace next to a paragraph break (for example trailing spaces or the
    ``\\r`` of CRLF line endings) is consumed with the break, so paragraphs
    never start or end with stray whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, with paragraphs separated by one blank line.
    """
    text = re.sub(r"/\*.*?\*/", "", text, flags=re.DOTALL)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[ \t]+", " ", text)
    # One pass over every whitespace run that includes a newline; the whole
    # run is replaced, so nothing adjacent to a paragraph break is left over.
    text = re.sub(r"\s*\n\s*", _collapse_newline_run, text)
    return text.strip()
|