File size: 6,697 Bytes
0ec6423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f45f4a9
0ec6423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d288030
 
 
0ec6423
d288030
0ec6423
 
 
 
 
 
f45f4a9
 
0ec6423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d288030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ec6423
d288030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ec6423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/**
 * LaTeX Metadata Extractor
 * Extracts document metadata from LaTeX files for frontmatter generation
 */

/**
 * Read the contents of the `{...}` group that opens at `openIndex`,
 * honouring nested groups and backslash-escaped characters.
 * @param {string} text - Text containing the group
 * @param {number} openIndex - Index of the opening `{`
 * @returns {string|null} - Group contents without the outer braces, or null
 *   when no closing brace exists at all
 */
function readBracedGroup(text, openIndex) {
    let depth = 0;
    for (let i = openIndex; i < text.length; i++) {
        const ch = text[i];
        if (ch === '\\') {
            i++; // Skip the escaped character so `\{` / `\}` do not affect depth
        } else if (ch === '{') {
            depth++;
        } else if (ch === '}') {
            depth--;
            if (depth === 0) {
                return text.slice(openIndex + 1, i);
            }
        }
    }

    // Unbalanced input: best effort, stop at the first closing brace
    const firstClose = text.indexOf('}', openIndex);
    return firstClose === -1 ? null : text.slice(openIndex + 1, firstClose);
}

/**
 * Find the first occurrence of `opener` and return its braced argument.
 * @param {string} text - Text to search
 * @param {RegExp} opener - Non-global pattern whose match ends on the opening `{`
 * @returns {string|null} - Argument contents, or null when not found
 */
function findBracedArgument(text, opener) {
    const match = opener.exec(text);
    if (!match) {
        return null;
    }
    return readBracedGroup(text, match.index + match[0].length - 1);
}

/**
 * Reduce a LaTeX fragment to plain text: drop `\command` names and
 * unescaped grouping braces, then collapse whitespace.
 * @param {string} text - LaTeX fragment
 * @returns {string} - Cleaned, trimmed text
 */
function stripLatexMarkup(text) {
    return text
        .replace(/\\[a-zA-Z]+/g, '')
        .replace(/(?<!\\)[{}]/g, '')
        .replace(/\s+/g, ' ')
        .trim();
}

/**
 * Extract metadata from LaTeX content
 *
 * Reads `\title{...}`, every `\authorOne[...]{...}`, `\contribution[]{...}`
 * and `\date{...}` (or `\newcommand{\date}{...}`).
 *
 * @param {string} latexContent - Raw LaTeX content
 * @returns {object} - Metadata with optional `title` and `authors`
 *   (`{ name, affiliations: number[] }`), plus `affiliations` (`{ name }[]`)
 *   and `published`, which are always set (with fallbacks)
 */
export function extractLatexMetadata(latexContent) {
    const metadata = {};

    // Title: brace-aware so nested groups such as \emph{...} are not truncated
    const rawTitle = findBracedArgument(latexContent, /\\title\s*\{/);
    if (rawTitle !== null) {
        const title = stripLatexMarkup(rawTitle);
        if (title) {
            metadata.title = title;
        }
    }

    // Affiliation macros and the affiliation index each one contributes.
    // The lookahead prevents `\hf` from matching longer commands like `\hfill`.
    // NOTE(review): \ensps and \oxford both map to 1, exactly as before;
    // confirm this against the frontmatter affiliation order.
    const affiliationMacros = [
        { pattern: /\\ensps(?![a-zA-Z])/, index: 1 },  // École Normale Supérieure
        { pattern: /\\oxford(?![a-zA-Z])/, index: 1 }, // University of Oxford
        { pattern: /\\hf(?![a-zA-Z])/, index: 2 },     // Hugging Face
    ];

    // Authors with their specific affiliations
    const authors = [];
    for (const match of latexContent.matchAll(/\\authorOne\[[^\]]*\]\{([^}]+)\}/g)) {
        const fullAuthorInfo = match[1];

        // A Set keeps each index once even when several macros share it
        const indices = new Set();
        for (const { pattern, index } of affiliationMacros) {
            if (pattern.test(fullAuthorInfo)) {
                indices.add(index);
            }
        }

        const authorName = stripLatexMarkup(fullAuthorInfo);

        // Skip empty authors or placeholder entries
        if (authorName && authorName !== '...') {
            authors.push({
                name: authorName,
                affiliations: indices.size > 0 ? [...indices] : [2], // Default to HF if no macro
            });
        }
    }

    if (authors.length > 0) {
        metadata.authors = authors;
    }

    // Affiliations listed in the \contribution command, split on `,` / `;`
    const contributionMatch = latexContent.match(/\\contribution\[\]\{([^}]+)\}/);
    if (contributionMatch) {
        const affiliations = contributionMatch[1]
            .split(/[,;]/)
            .map(stripLatexMarkup)
            .filter(name => name.length > 0)
            .map(name => ({ name }));

        if (affiliations.length > 0) {
            metadata.affiliations = affiliations;
        }
    }

    // Fallback to hardcoded affiliations if no usable \contribution found
    if (!metadata.affiliations || metadata.affiliations.length === 0) {
        metadata.affiliations = [
            { name: "École Normale Supérieure Paris-Saclay" },
            { name: "University of Oxford" },
            { name: "Hugging Face" },
        ];
    }

    // Date (common LaTeX patterns). A date made only of commands, e.g.
    // \date{\today}, cleans to an empty string and is treated as "no date".
    const dateOpeners = [
        /\\date\s*\{/,
        /\\newcommand\s*\{\\date\}\s*\{/,
    ];

    for (const opener of dateOpeners) {
        const rawDate = findBracedArgument(latexContent, opener);
        const cleanDate = rawDate === null ? '' : stripLatexMarkup(rawDate);
        if (cleanDate) {
            metadata.published = cleanDate;
            break;
        }
    }

    // Fallback to current date if no date found
    if (!metadata.published) {
        metadata.published = new Date().toLocaleDateString('en-US', {
            year: 'numeric',
            month: 'short',
            day: '2-digit'
        });
    }

    return metadata;
}

/**
 * Render a value as a YAML double-quoted scalar.
 * JSON string syntax is used because its escapes (`\"`, `\\`, `\n`, ...) are
 * valid inside YAML double quotes; plain text comes out as `"text"`.
 * @param {*} value - Value to render (coerced to string)
 * @returns {string} - Quoted and escaped scalar
 */
function quoteYamlString(value) {
    return JSON.stringify(String(value));
}

/**
 * Generate YAML frontmatter from metadata object
 *
 * Emits, in order and only when present: `title`, `authors` (name, optional
 * url, affiliations), `affiliations` (name, optional url), `published`,
 * `doi`, `description`, `licence` (folded block) and `tags`, followed by the
 * fixed `tableOfContentsAutoCollapse: true` line.
 *
 * @param {object} metadata - Metadata object
 * @returns {string} - YAML frontmatter string, delimited by `---` lines and
 *   terminated by a blank line
 */
export function generateFrontmatter(metadata) {
    const lines = ['---'];

    // Title
    if (metadata.title) {
        lines.push(`title: ${quoteYamlString(metadata.title)}`);
    }

    // Authors
    if (metadata.authors && metadata.authors.length > 0) {
        lines.push('authors:');
        for (const author of metadata.authors) {
            lines.push(`  - name: ${quoteYamlString(author.name)}`);
            if (author.url) {
                lines.push(`    url: ${quoteYamlString(author.url)}`);
            }
            // Tolerate authors without an affiliations array instead of throwing
            const indices = Array.isArray(author.affiliations) ? author.affiliations : [];
            lines.push(`    affiliations: [${indices.join(', ')}]`);
        }
    }

    // Affiliations
    if (metadata.affiliations && metadata.affiliations.length > 0) {
        lines.push('affiliations:');
        for (const affiliation of metadata.affiliations) {
            lines.push(`  - name: ${quoteYamlString(affiliation.name)}`);
            if (affiliation.url) {
                lines.push(`    url: ${quoteYamlString(affiliation.url)}`);
            }
        }
    }

    // Publication date
    if (metadata.published) {
        lines.push(`published: ${quoteYamlString(metadata.published)}`);
    }

    // Additional metadata
    if (metadata.doi) {
        lines.push(`doi: ${quoteYamlString(metadata.doi)}`);
    }

    if (metadata.description) {
        lines.push(`description: ${quoteYamlString(metadata.description)}`);
    }

    if (metadata.licence) {
        // Every line of a folded block must carry the block's indentation
        const indented = String(metadata.licence).replace(/\r?\n/g, '\n  ');
        lines.push('licence: >');
        lines.push(`  ${indented}`);
    }

    if (metadata.tags && metadata.tags.length > 0) {
        lines.push('tags:');
        for (const tag of metadata.tags) {
            lines.push(`  - ${tag}`);
        }
    }

    // Default Astro configuration
    lines.push('tableOfContentsAutoCollapse: true');
    lines.push('---');

    return `${lines.join('\n')}\n\n`;
}

/**
 * Build YAML frontmatter directly from LaTeX source.
 * Passes the result of `extractLatexMetadata` straight to
 * `generateFrontmatter`.
 * @param {string} latexContent - Raw LaTeX content
 * @returns {string} - Complete YAML frontmatter
 */
export function extractAndGenerateFrontmatter(latexContent) {
    return generateFrontmatter(extractLatexMetadata(latexContent));
}