# NOTE(review): the lines below are residue from a Hugging Face Spaces file-view
# scrape (page chrome, gutter line numbers). Recoverable metadata preserved:
#   File size: 3,719 bytes
#   Commits referenced: db06013, 0a02cd7
from typing import List, Dict, Any
import re
import logging
logger = logging.getLogger(__name__)
class Preprocessor:
    """Lightweight text preprocessor with no external (NLTK/spaCy) dependencies.

    Provides cleaning, naive sentence splitting, regex word tokenization,
    passage/QA preprocessing, and overlapping chunk creation.
    """

    def __init__(self):
        """Initialize preprocessor without external dependencies."""
        pass

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Collapses all whitespace runs to single spaces and strips any
        character that is not a word character, whitespace, or common
        punctuation (. , ! ? ; : - ( )).

        Args:
            text: Raw input text (may be empty or None-like falsy).

        Returns:
            The cleaned text, or "" for falsy input.
        """
        if not text:
            return ""
        # Collapse runs of whitespace (tabs, newlines, multiple spaces) to one space.
        text = text.strip()
        text = re.sub(r'\s+', ' ', text)
        # Drop special characters but keep basic punctuation.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)
        return text.strip()

    def extract_sentences(self, text: str) -> List[str]:
        """Extract sentences from text (simplified version without NLTK).

        Splits on runs of terminal punctuation (. ! ?). This is naive:
        abbreviations like "Dr." will be treated as sentence boundaries.

        Args:
            text: Input text.

        Returns:
            Non-empty, stripped sentence fragments; [] for falsy input.
        """
        if not text:
            return []
        # Simple sentence splitting on terminal punctuation.
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into lowercase words (simplified version).

        Args:
            text: Input text.

        Returns:
            Lowercase word tokens (\\b\\w+\\b matches); [] for falsy input.
        """
        if not text:
            return []
        # Simple word tokenization; lowercases so tokens are case-insensitive.
        return re.findall(r'\b\w+\b', text.lower())

    def preprocess_passages(self, passages: List[str]) -> List[Dict[str, Any]]:
        """Preprocess a list of passages.

        Empty/falsy passages are skipped, but 'id' keeps the passage's
        original index in the input list.

        Args:
            passages: Raw passage strings.

        Returns:
            One dict per non-empty passage with keys:
            'id', 'text', 'sentences', 'tokens', 'length' (token count).
        """
        processed = []
        for i, passage in enumerate(passages):
            if not passage:
                continue
            cleaned = self.clean_text(passage)
            tokens = self.tokenize(cleaned)
            processed.append({
                'id': i,
                'text': cleaned,
                'sentences': self.extract_sentences(cleaned),
                'tokens': tokens,
                'length': len(tokens)
            })
        return processed

    def preprocess_qa_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Preprocess QA data, auto-converting dict/list fields to strings.

        Non-dict items are skipped. 'question'/'answer'/'context' fields that
        are dicts or lists are recursively flattened by joining their values
        with spaces; None becomes "".

        Args:
            data: List of QA records (dicts with question/answer/context).

        Returns:
            One dict per input record with cleaned text and token lists for
            each of question, answer, and context.
        """
        processed = []

        def to_str(val):
            # Recursively flatten nested containers into a space-joined string.
            if isinstance(val, dict):
                # Concatenate all values.
                return " ".join(to_str(v) for v in val.values())
            elif isinstance(val, list):
                return " ".join(to_str(v) for v in val)
            elif val is None:
                return ""
            return str(val)

        for item in data:
            if not isinstance(item, dict):
                continue
            question = to_str(item.get('question', ''))
            answer = to_str(item.get('answer', ''))
            context = to_str(item.get('context', ''))
            processed.append({
                'question': self.clean_text(question),
                'answer': self.clean_text(answer),
                'context': self.clean_text(context),
                'question_tokens': self.tokenize(question),
                'answer_tokens': self.tokenize(answer),
                'context_tokens': self.tokenize(context)
            })
        return processed

    def create_chunks(self, text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """Create overlapping token chunks from text.

        Tokenizes (lowercased words) and slices into windows of chunk_size
        tokens stepping by (chunk_size - overlap), then joins each window
        back with single spaces.

        Args:
            text: Input text.
            chunk_size: Maximum tokens per chunk (must be > overlap).
            overlap: Tokens shared between consecutive chunks.

        Returns:
            Space-joined token chunks; [] for falsy input.

        Raises:
            ValueError: If overlap >= chunk_size (the original code would
                silently return [] or raise an opaque range() error).
        """
        if not text:
            return []
        # Guard: a non-positive step would loop nowhere (negative step ->
        # empty range -> silently []) or crash (zero step -> ValueError
        # from range()). Fail loudly with a clear message instead.
        if overlap >= chunk_size:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )
        tokens = self.tokenize(text)
        chunks = []
        for start in range(0, len(tokens), chunk_size - overlap):
            chunks.append(' '.join(tokens[start:start + chunk_size]))
        return chunks