Spaces:
Sleeping
Sleeping
| from typing import Dict, List | |
| from Tokenization.preprocessing.Clean_text import clean_text | |
| from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs | |
| def preprocess_sample(paper: Dict) -> List[Dict]: | |
| """ | |
| Clean and segment a paper into samples for LLM ingestion. | |
| Returns a list of dicts: one for title+abstract, and one per paragraph. | |
| """ | |
| title = clean_text(paper.get("title", "")) | |
| abstract = clean_text(paper.get("abstract", "")) | |
| full_text = clean_text(paper.get("full_text", "")) | |
| paragraphs = segment_paragraphs(full_text) if full_text else [] | |
| samples = [] | |
| # Title + abstract sample | |
| if title or abstract: | |
| sample = dict(paper) | |
| sample["title"] = title | |
| sample["abstract"] = abstract | |
| sample["full_text"] = "" | |
| sample["section"] = "abstract" | |
| samples.append(sample) | |
| # Paragraph samples | |
| for para in paragraphs: | |
| sample = dict(paper) | |
| sample["title"] = title | |
| sample["abstract"] = "" | |
| sample["full_text"] = para | |
| sample["section"] = "paragraph" | |
| samples.append(sample) | |
| return samples | |