Spaces:
Build error
Build error
| # data_ingestion/preprocess_data.py | |
| import re | |
def preprocess_text(data):
    """Extract ``{"author", "text"}`` records from "Author: Message" lines.

    Each item of *data* is tested against the pattern
    ``<author>:<whitespace><message>``. Items that do not fit the pattern
    are dropped silently; the rest are returned in their original order.

    Args:
        data: An iterable of strings, one candidate line per item.

    Returns:
        A list of dicts, each with an ``"author"`` key and a ``"text"``
        key, both stripped of surrounding whitespace.
    """
    # The author group is non-greedy, so it ends at the first colon that is
    # followed by whitespace; colons not followed by whitespace (e.g. inside
    # "12:30") remain part of the author field.
    line_pattern = re.compile(r"^(.*?):\s+(.*)$")

    candidates = (line_pattern.match(entry) for entry in data)
    return [
        {"author": found.group(1).strip(), "text": found.group(2).strip()}
        for found in candidates
        if found
    ]