// p2pclaw-api/packages/api/src/utils/validationUtils.js
// Frank-Agnuxo
// feat: P2PCLAW API for HF Spaces — ChessBoard Reasoning Engine + full API
// e92be04
/**
 * Extracts the body of one section from markdown content.
 *
 * The section starts right after the first occurrence of `sectionName` and
 * runs until the next level-2 heading (a line starting with "## ") or the end
 * of the content. Deeper headings ("### ...") stay part of the section body.
 *
 * @param {string} content - Markdown text to search.
 * @param {string} sectionName - Heading to look for, e.g. "## Abstract".
 *   It is matched literally (regex metacharacters are escaped).
 * @returns {string} The trimmed section body, or "" when the heading is
 *   absent, the section is empty, or either argument is not a string.
 */
export function extractSection(content, sectionName) {
  if (typeof content !== "string" || typeof sectionName !== "string") {
    return "";
  }
  const escaped = sectionName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Deliberately no greedy `\s*` after the heading: it would swallow the
  // newline that the `\n## ` lookahead needs, so an empty section would
  // capture the following section instead. trim() removes the leading
  // whitespace of the capture anyway.
  const pattern = new RegExp(`${escaped}([\\s\\S]*?)(?=\\n## |$)`);
  const match = content.match(pattern);
  return match ? match[1].trim() : "";
}
/**
 * Structural and semantic validation of a paper.
 *
 * Scoring (100 points total):
 *   A. Structure  — 40 pts: all 7 required sections present
 *   B. Length     — 20 pts: >= 1500 words (~2000 tokens)
 *   C. References — 20 pts: >= 3 [N] citations
 *   D. Coherence  — 20 pts: keyword overlap between abstract and conclusion
 *
 * A paper is valid when it reaches at least 60 of the 100 points.
 *
 * @param {{ content?: string }} paper - Paper to score. A missing paper or
 *   non-string `content` is scored as an empty document instead of throwing.
 * @returns {{ valid: boolean, score: number, details: object }}
 *   `score` is the total normalised to 0..1 (3 decimals); `details` holds the
 *   raw counts and the per-criterion point breakdown (1 decimal each).
 */
export function validatePaper(paper) {
  const content = paper && typeof paper.content === "string" ? paper.content : "";

  // A. Section structure (40 pts)
  const REQUIRED_SECTIONS = [
    "## Abstract", "## Introduction", "## Methodology",
    "## Results", "## Discussion", "## Conclusion", "## References"
  ];
  const foundSections = REQUIRED_SECTIONS.filter((s) => content.includes(s));
  const sectionScore = (foundSections.length / REQUIRED_SECTIONS.length) * 40;

  // B. Word count (20 pts) — target: 1500 words minimum (~2000 tokens)
  const words = content.split(/\s+/).filter((w) => w.length > 0).length;
  const wordScore = Math.min((words / 1500) * 20, 20);

  // C. References (20 pts) — every "[N]" occurrence counts
  const refs = (content.match(/\[\d+\]/g) || []).length;
  const refScore = Math.min((refs / 3) * 20, 20);

  // D. Semantic coherence: abstract keywords present in conclusion (20 pts)
  const abstract = extractSection(content, "## Abstract");
  // Lower-cased once; it is searched for every keyword below.
  const conclusion = extractSection(content, "## Conclusion").toLowerCase();
  // Note: keywords have 5+ characters, so "with" and "from" never match.
  const stopWords = new Set(["which", "their", "there", "these", "those", "where",
    "about", "after", "before", "during", "through", "between", "under",
    "above", "below", "while", "being", "using", "based", "with", "from"]);
  const rawKeywords = abstract.toLowerCase().match(/\b\w{5,}\b/g) || [];
  // Drop stop words BEFORE capping at 20 so they do not use up keyword slots.
  const keywords = [...new Set(rawKeywords)]
    .filter((kw) => !stopWords.has(kw))
    .slice(0, 20);
  const overlap = keywords.filter((kw) => conclusion.includes(kw)).length;
  const coherenceScore = keywords.length > 0
    ? (overlap / keywords.length) * 20
    : 10; // neutral if abstract is too short

  // The partial scores are fractions (n/7, n/3, ...) whose float sum can land
  // a hair below the true value (e.g. 59.99999999999999 instead of 60).
  // Round away that noise before comparing against the threshold.
  const rawTotal = sectionScore + wordScore + refScore + coherenceScore;
  const total = Math.round(rawTotal * 1e6) / 1e6;
  const score = parseFloat((total / 100).toFixed(3));

  return {
    valid: total >= 60,
    score,
    details: {
      sections: `${foundSections.length}/${REQUIRED_SECTIONS.length}`,
      words,
      refs,
      coherence: keywords.length > 0
        ? `${overlap}/${keywords.length} keywords`
        : "N/A",
      breakdown: {
        structure: parseFloat(sectionScore.toFixed(1)),
        length: parseFloat(wordScore.toFixed(1)),
        references: parseFloat(refScore.toFixed(1)),
        coherence: parseFloat(coherenceScore.toFixed(1))
      }
    }
  };
}