Delete utils.py
Browse files
utils.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Utility functions for text processing and chunking in the RAG pipeline.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import re
|
| 6 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
-
from transformers import AutoTokenizer
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def clean_references(text):
    """Remove references, contact info, irrelevant sentences, and unnecessary punctuation.

    Keep only periods, commas, apostrophes, and question marks.

    Args:
        text (str): Raw text to clean

    Returns:
        str: Cleaned text (empty string for non-string input)
    """
    if not isinstance(text, str):
        return ""

    # Lowercase first so every later match can assume lowercase input.
    text = text.lower()

    # Contact-information patterns.
    phone_pattern = r'\b\d{3}-\d{3}-\d{4}\b|\b\d{3}-\d{2}-\d{4}\b|\b1-\d{3}-\d{3}-\d{4}\b|\b\d{4}-\d{3}-\d{4}\b|\bToll Free:.*?\b'
    # Bug fix: the TLD class was '[A-Z|a-z]', which wrongly admits a literal
    # '|' character; '[A-Za-z]' is the intended alternation-free class.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    address_pattern = r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5}(-\d{4})?'

    # Strip the patterns.  Bug fix: the address pattern must be
    # case-insensitive — the text is already lowercased, so its '[A-Z]{2}'
    # state-code segment could never match without re.IGNORECASE.
    text = re.sub(phone_pattern, '', text, flags=re.IGNORECASE)
    text = re.sub(email_pattern, '', text)
    text = re.sub(url_pattern, '', text)
    text = re.sub(address_pattern, '', text, flags=re.IGNORECASE)

    # Sentences containing any of these keywords are dropped wholesale.
    irrelevant_keywords = [
        'toll free', 'toll-free', 'phone', 'email', 'fax', 'tty',
        'for more information', 'learn more', 'www', 'click', 'visit', 'call',
        'website', 'websites', 'see also', 'read more', 'see the pronunciation',
        'clearinghouse', 'esc', 'keyboard', 'video', 'glossary', 'chapter',
        'section', 'version', 'copyright', 'download', 'archived',
        'nci', 'niddk', 'national institute', 'american journal'
    ]

    # Split on sentence-ending punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter out sentences carrying any irrelevant keyword.  No .lower()
    # needed on each sentence: the whole text was lowercased above.
    cleaned_sentences = [
        s for s in sentences
        if not any(keyword in s for keyword in irrelevant_keywords)
    ]

    # Re-join, inserting '. ' where a kept sentence lacks a terminator.
    cleaned_text = ""
    for i, sentence in enumerate(cleaned_sentences):
        if i == 0:
            cleaned_text = sentence
        else:
            if cleaned_text and not cleaned_text.endswith(('.', '?')):
                cleaned_text += '. '
            else:
                cleaned_text += ' '
            cleaned_text += sentence

    cleaned_text = cleaned_text.strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = re.sub(r'^\s*\.\s*', '', cleaned_text)

    # Keep only alphanumerics, whitespace, and a minimal punctuation set.
    # join() over a generator replaces the original quadratic '+=' loop.
    allowed_punct = {'.', ',', "'", '?'}
    text_minimal_punct = ''.join(
        char for char in cleaned_text
        if char.isalnum() or char.isspace() or char in allowed_punct
    )

    cleaned_text = ' '.join(text_minimal_punct.split()).strip()

    # Ensure the result ends with terminal punctuation.
    if cleaned_text and not cleaned_text.endswith(('.', '?')):
        cleaned_text += '.'

    return cleaned_text
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def create_chunks(dataframe, tokenizer, chunk_size=350, chunk_overlap=50):
    """Split dataframe answers into chunks with metadata.

    Args:
        dataframe (pd.DataFrame): DataFrame with 'question' and 'answer' columns
        tokenizer: HuggingFace tokenizer for token counting
        chunk_size (int): Target tokens per chunk
        chunk_overlap (int): Overlap tokens between chunks

    Returns:
        list: List of chunk dictionaries with metadata
    """
    # Chunk length is measured in tokenizer tokens, not characters.
    def _token_length(txt):
        return len(tokenizer.encode(txt))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=_token_length,
    )

    all_chunks = []
    # Each dataframe row gets a sequential question id, whether or not the
    # splitter produces any chunks for it.
    for question_id, (_, row) in enumerate(dataframe.iterrows()):
        question = row.get('question', '')
        focus_area = row.get('focus_area', 'Unknown')
        source = row.get('source', 'Unknown')

        for local_id, piece in enumerate(splitter.split_text(row.get('answer', ''))):
            all_chunks.append({
                'question_id': question_id,
                'chunk_id': f"{question_id}_{local_id}",
                'question': question,
                'chunk_answer': piece,
                'focus_area': focus_area,
                'source': source,
            })

    return all_chunks
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def create_embeddings(documents, embed_model):
    """Create embeddings for documents using SentenceTransformer.

    Args:
        documents (list): List of text documents to embed
        embed_model: SentenceTransformer model instance

    Returns:
        np.ndarray: Embeddings array
    """
    # Encode in batches of 64 with L2-normalized numpy output and no
    # progress bar; kwargs gathered up front for readability.
    encode_options = {
        'batch_size': 64,
        'show_progress_bar': False,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    return embed_model.encode(documents, **encode_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|