// Scraped page header: file size 3,156 bytes; id e92be04 (presumably a revision hash).
/**
 * Extracts the body of a section from markdown content.
 *
 * The section body starts right after the first occurrence of `sectionName`
 * (matched literally, regex metacharacters are escaped) and ends just before
 * the next line that begins with "## ", or at the end of the content.
 *
 * @param {string} content - Markdown text to search.
 * @param {string} sectionName - Literal heading text, e.g. "## Abstract".
 * @returns {string} The trimmed section body; "" when the heading is absent,
 *   the section is empty, or either argument is not a string.
 */
export function extractSection(content, sectionName) {
    // Non-string input cannot contain a section; report "not found" instead of throwing.
    if (typeof content !== "string" || typeof sectionName !== "string") {
        return "";
    }
    const escaped = sectionName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // No greedy whitespace skip after the heading: it would swallow the newline
    // that starts the next "\n## " delimiter, making an empty section return the
    // following section's text. Surrounding whitespace is removed by trim() below.
    const pattern = new RegExp(`${escaped}([\\s\\S]*?)(?=\\n## |$)`);
    const match = content.match(pattern);
    return match ? match[1].trim() : "";
}

/**
 * Structural and semantic validation of a paper.
 *
 * Scoring (100 points total):
 *   A. Structure   — 40 pts: proportional to how many of the 7 required
 *                    "## " headings occur in the content
 *   B. Length      — 20 pts: proportional to word count, capped at 1500 words
 *   C. References  — 20 pts: proportional to the number of [N] citations, capped at 3
 *   D. Coherence   — 20 pts: share of abstract keywords that also occur in the
 *                    conclusion (10 pts neutral when the abstract has no keywords)
 *
 * @param {{content?: string}} paper - Paper object; a missing paper or a
 *   non-string `content` is scored as empty content.
 * @returns {{valid: boolean, score: number, details: object}} `valid` is true
 *   when the total reaches 60 points; `score` is the total scaled to 0..1 and
 *   rounded to 3 decimals; `details` holds the raw counts and per-criterion
 *   points rounded to 1 decimal.
 */
export function validatePaper(paper) {
    // Tolerate a missing paper or non-string content rather than throwing.
    const rawContent = paper ? paper.content : "";
    const content = typeof rawContent === "string" ? rawContent : "";

    // A. Section structure (40 pts)
    const REQUIRED_SECTIONS = [
        "## Abstract", "## Introduction", "## Methodology",
        "## Results", "## Discussion", "## Conclusion", "## References"
    ];
    const foundSections = REQUIRED_SECTIONS.filter(s => content.includes(s));
    const sectionScore = (foundSections.length / REQUIRED_SECTIONS.length) * 40;

    // B. Word count (20 pts) — target: 1500 words minimum (~2000 tokens)
    const words = content.split(/\s+/).filter(w => w.length > 0).length;
    const wordScore = Math.min((words / 1500) * 20, 20);

    // C. References (20 pts) — every [N] occurrence counts, repeats included
    const refs = (content.match(/\[\d+\]/g) || []).length;
    const refScore = Math.min((refs / 3) * 20, 20);

    // D. Semantic coherence: abstract keywords present in conclusion (20 pts)
    const abstract = extractSection(content, "## Abstract");
    // Lower-cased once here instead of once per keyword.
    const conclusion = extractSection(content, "## Conclusion").toLowerCase();
    const rawKeywords = abstract.toLowerCase().match(/\b\w{5,}\b/g) || [];
    // "with" and "from" can never match the 5+ letter pattern above; they are
    // kept so the list stays correct if the minimum length is ever lowered.
    const stopWords = new Set(["which", "their", "there", "these", "those", "where",
        "about", "after", "before", "during", "through", "between", "under",
        "above", "below", "while", "being", "using", "based", "with", "from"]);
    // Drop stop words BEFORE capping at 20 so they do not use up keyword slots.
    const keywords = [...new Set(rawKeywords)]
        .filter(kw => !stopWords.has(kw))
        .slice(0, 20);
    // Substring match: a keyword also counts when it appears inside a longer word.
    const overlap = keywords.filter(kw => conclusion.includes(kw)).length;
    const coherenceScore = keywords.length > 0
        ? (overlap / keywords.length) * 20
        : 10; // neutral if abstract is too short

    const total = sectionScore + wordScore + refScore + coherenceScore;
    const score = parseFloat((total / 100).toFixed(3));

    return {
        valid: total >= 60,
        score,
        details: {
            sections: `${foundSections.length}/${REQUIRED_SECTIONS.length}`,
            words,
            refs,
            coherence: keywords.length > 0
                ? `${overlap}/${keywords.length} keywords`
                : "N/A",
            breakdown: {
                structure: parseFloat(sectionScore.toFixed(1)),
                length: parseFloat(wordScore.toFixed(1)),
                references: parseFloat(refScore.toFixed(1)),
                coherence: parseFloat(coherenceScore.toFixed(1))
            }
        }
    };
}