Delete utils.py
Browse files
utils.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Utility functions for text processing and chunking in the RAG pipeline.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import re
|
| 6 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
-
from transformers import AutoTokenizer
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def clean_references(text):
    """Remove references, contact info, irrelevant sentences, and unnecessary punctuation.

    Keep only periods, commas, apostrophes, and question marks.

    Args:
        text (str): Raw text to clean

    Returns:
        str: Cleaned text (empty string for non-string input)
    """
    if not isinstance(text, str):
        return ""

    # Lowercase first so every later match can assume lowercase input.
    text = text.lower()

    # Contact-information patterns.
    phone_pattern = r'\b\d{3}-\d{3}-\d{4}\b|\b\d{3}-\d{2}-\d{4}\b|\b1-\d{3}-\d{3}-\d{4}\b|\b\d{4}-\d{3}-\d{4}\b|\bToll Free:.*?\b'
    # Bug fix: the TLD class was '[A-Z|a-z]', which wrongly admits a literal
    # '|' character; '[A-Za-z]' is the intended alternation-free class.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    address_pattern = r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5}(-\d{4})?'

    # Strip the patterns.  Bug fix: the address pattern must be
    # case-insensitive — the text is already lowercased, so its '[A-Z]{2}'
    # state-code segment could never match without re.IGNORECASE.
    text = re.sub(phone_pattern, '', text, flags=re.IGNORECASE)
    text = re.sub(email_pattern, '', text)
    text = re.sub(url_pattern, '', text)
    text = re.sub(address_pattern, '', text, flags=re.IGNORECASE)

    # Sentences containing any of these keywords are dropped wholesale.
    irrelevant_keywords = [
        'toll free', 'toll-free', 'phone', 'email', 'fax', 'tty',
        'for more information', 'learn more', 'www', 'click', 'visit', 'call',
        'website', 'websites', 'see also', 'read more', 'see the pronunciation',
        'clearinghouse', 'esc', 'keyboard', 'video', 'glossary', 'chapter',
        'section', 'version', 'copyright', 'download', 'archived',
        'nci', 'niddk', 'national institute', 'american journal'
    ]

    # Split on sentence-ending punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter out sentences carrying any irrelevant keyword.  No .lower()
    # needed on each sentence: the whole text was lowercased above.
    cleaned_sentences = [
        s for s in sentences
        if not any(keyword in s for keyword in irrelevant_keywords)
    ]

    # Re-join, inserting '. ' where a kept sentence lacks a terminator.
    cleaned_text = ""
    for i, sentence in enumerate(cleaned_sentences):
        if i == 0:
            cleaned_text = sentence
        else:
            if cleaned_text and not cleaned_text.endswith(('.', '?')):
                cleaned_text += '. '
            else:
                cleaned_text += ' '
            cleaned_text += sentence

    cleaned_text = cleaned_text.strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = re.sub(r'^\s*\.\s*', '', cleaned_text)

    # Keep only alphanumerics, whitespace, and a minimal punctuation set.
    # join() over a generator replaces the original quadratic '+=' loop.
    allowed_punct = {'.', ',', "'", '?'}
    text_minimal_punct = ''.join(
        char for char in cleaned_text
        if char.isalnum() or char.isspace() or char in allowed_punct
    )

    cleaned_text = ' '.join(text_minimal_punct.split()).strip()

    # Ensure the result ends with terminal punctuation.
    if cleaned_text and not cleaned_text.endswith(('.', '?')):
        cleaned_text += '.'

    return cleaned_text
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def create_chunks(dataframe, tokenizer, chunk_size=350, chunk_overlap=50):
    """Split dataframe answers into chunks with metadata.

    Args:
        dataframe (pd.DataFrame): DataFrame with 'question' and 'answer' columns
        tokenizer: HuggingFace tokenizer for token counting
        chunk_size (int): Target tokens per chunk
        chunk_overlap (int): Overlap tokens between chunks

    Returns:
        list: List of chunk dictionaries with metadata
    """
    # Chunk length is measured in tokenizer tokens, not characters.
    def _token_length(txt):
        return len(tokenizer.encode(txt))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=_token_length,
    )

    all_chunks = []
    # Each dataframe row gets a sequential question id, whether or not the
    # splitter produces any chunks for it.
    for question_id, (_, row) in enumerate(dataframe.iterrows()):
        question = row.get('question', '')
        focus_area = row.get('focus_area', 'Unknown')
        source = row.get('source', 'Unknown')

        for local_id, piece in enumerate(splitter.split_text(row.get('answer', ''))):
            all_chunks.append({
                'question_id': question_id,
                'chunk_id': f"{question_id}_{local_id}",
                'question': question,
                'chunk_answer': piece,
                'focus_area': focus_area,
                'source': source,
            })

    return all_chunks
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def create_embeddings(documents, embed_model):
    """Create embeddings for documents using SentenceTransformer.

    Args:
        documents (list): List of text documents to embed
        embed_model: SentenceTransformer model instance

    Returns:
        np.ndarray: Embeddings array
    """
    # Encode in batches of 64 with L2-normalized numpy output and no
    # progress bar; kwargs gathered up front for readability.
    encode_options = {
        'batch_size': 64,
        'show_progress_bar': False,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    return embed_model.encode(documents, **encode_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|