amiraghhh committed on
Commit
cb7eb08
·
verified ·
1 Parent(s): ebfdd83

Delete utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -148
utils.py DELETED
@@ -1,148 +0,0 @@
1
- """
2
- Utility functions for text processing and chunking in the RAG pipeline.
3
- """
4
-
5
- import re
6
- from langchain_text_splitters import RecursiveCharacterTextSplitter
7
- from transformers import AutoTokenizer
8
-
9
-
10
def clean_references(text):
    """Remove references, contact info, irrelevant sentences, and unnecessary punctuation.

    Keeps only periods, commas, apostrophes, and question marks; every other
    punctuation character (dashes, colons, parentheses, ...) is stripped.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Cleaned, lowercased text ending in '.' or '?'. Returns "" for
        non-string input or when every sentence was filtered out.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase first; every later pattern must therefore match
    # case-insensitively to have any effect.
    text = text.lower()

    # Contact-information patterns.
    phone_pattern = r'\b\d{3}-\d{3}-\d{4}\b|\b\d{3}-\d{2}-\d{4}\b|\b1-\d{3}-\d{3}-\d{4}\b|\b\d{4}-\d{3}-\d{4}\b|\bToll Free:.*?\b'
    # Fixed TLD class: '[A-Z|a-z]' contained a literal '|' inside the
    # character class ('|' is not alternation there).
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    address_pattern = r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5}(-\d{4})?'

    # Remove patterns.
    text = re.sub(phone_pattern, '', text, flags=re.IGNORECASE)
    text = re.sub(email_pattern, '', text)
    text = re.sub(url_pattern, '', text)
    # Bug fix: the address pattern requires an uppercase state code
    # ([A-Z]{2}) but runs on already-lowercased text, so without
    # re.IGNORECASE it could never match.
    text = re.sub(address_pattern, '', text, flags=re.IGNORECASE)

    # Sentences containing any of these keywords are dropped entirely.
    # NOTE(review): matching is by plain substring, so e.g. 'call' also
    # matches inside 'medically' — confirm this coarse filter is intended.
    irrelevant_keywords = [
        'toll free', 'toll-free', 'phone', 'email', 'fax', 'tty',
        'for more information', 'learn more', 'www', 'click', 'visit', 'call',
        'website', 'websites', 'see also', 'read more', 'see the pronunciation',
        'clearinghouse', 'esc', 'keyboard', 'video', 'glossary', 'chapter',
        'section', 'version', 'copyright', 'download', 'archived',
        'nci', 'niddk', 'national institute', 'american journal'
    ]

    # Split by sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter sentences with irrelevant keywords.
    cleaned_sentences = [s for s in sentences if not any(keyword in s.lower() for keyword in irrelevant_keywords)]

    # Smart join: insert '. ' between fragments whose terminator was removed.
    cleaned_text = ""
    for i, sentence in enumerate(cleaned_sentences):
        if i == 0:
            cleaned_text = sentence
        else:
            if cleaned_text and not cleaned_text.endswith(('.', '?')):
                cleaned_text += '. '
            else:
                cleaned_text += ' '
            cleaned_text += sentence

    cleaned_text = cleaned_text.strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Drop a stray leading period left over from removed fragments.
    cleaned_text = re.sub(r'^\s*\.\s*', '', cleaned_text)

    # Keep only alphanumerics, whitespace, and the allowed punctuation.
    allowed_punct = {'.', ',', "'", '?'}
    text_minimal_punct = ""
    for char in cleaned_text:
        if char.isalnum() or char.isspace() or char in allowed_punct:
            text_minimal_punct += char

    cleaned_text = ' '.join(text_minimal_punct.split()).strip()

    # Ensure text ends with sentence-final punctuation.
    if cleaned_text and not cleaned_text.endswith(('.', '?')):
        cleaned_text += '.'

    return cleaned_text
84
-
85
-
86
def create_chunks(dataframe, tokenizer, chunk_size=350, chunk_overlap=50):
    """Split each answer in the dataframe into overlapping chunks with metadata.

    Args:
        dataframe (pd.DataFrame): Rows with 'question' and 'answer' columns
            (optionally 'focus_area' and 'source').
        tokenizer: HuggingFace tokenizer used to measure chunk length in tokens.
        chunk_size (int): Target number of tokens per chunk.
        chunk_overlap (int): Tokens shared between consecutive chunks.

    Returns:
        list: One dict per chunk carrying the parent question's metadata.
    """
    def _token_count(s):
        # Chunk length is measured in tokenizer tokens, not characters.
        return len(tokenizer.encode(s))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=_token_count,
    )

    all_chunks = []
    # qid counts rows in order, independent of the dataframe's index labels.
    for qid, (_, row) in enumerate(dataframe.iterrows()):
        question = row.get('question', '')
        answer = row.get('answer', '')
        focus_area = row.get('focus_area', 'Unknown')
        source = row.get('source', 'Unknown')

        for local_id, piece in enumerate(splitter.split_text(answer)):
            all_chunks.append({
                'question_id': qid,
                'chunk_id': f"{qid}_{local_id}",
                'question': question,
                'chunk_answer': piece,
                'focus_area': focus_area,
                'source': source,
            })

    return all_chunks
129
-
130
-
131
def create_embeddings(documents, embed_model):
    """Encode documents into a normalized embedding matrix.

    Args:
        documents (list): Text documents to embed.
        embed_model: SentenceTransformer-style model exposing ``encode``.

    Returns:
        np.ndarray: Embeddings array, one row per document.
    """
    # Batched, progress-bar-free encoding; normalize_embeddings=True yields
    # unit-length vectors so dot product equals cosine similarity downstream.
    return embed_model.encode(
        documents,
        batch_size=64,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )