Upload folder using huggingface_hub
Browse files- convert-dataset-to-annif-corpus.py +91 -0
- convert-output-to-xlsx.py +90 -0
- download-dataset.py +38 -0
convert-dataset-to-annif-corpus.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
|
| 3 |
+
from rdflib import Graph
|
| 4 |
+
from rdflib.namespace import SKOS
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Raise the csv module's per-field size cap to ~100x the default (131072 bytes):
# EuroVoc document texts routinely exceed the stock limit and would otherwise
# make csv.DictReader raise csv.Error mid-file.
csv.field_size_limit(131072 * 100)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_eurovoc_notations(graph_filename):
    """Parse a EuroVoc SKOS RDF/XML dump and map each notation to its concept URI.

    Args:
        graph_filename (str): Path to the RDF/XML file (e.g. eurovoc-skos-ap-eu.rdf).

    Returns:
        dict: str(skos:notation) -> str(concept URI) for every notation triple.
    """
    graph = Graph()
    graph.parse(graph_filename, format='xml')
    print("Graph parsed")

    # Every (concept, notation) pair becomes one notation -> concept entry.
    return {
        str(notation): str(concept)
        for concept, notation in graph.subject_objects(SKOS.notation)
    }
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def convert_ids_to_uris(ids):
    """Convert EuroVoc identifiers to bracketed URI strings.

    Six-character ids are treated as zero-padded numeric EuroVoc concept ids
    and turned into URIs directly; anything else is looked up in the
    module-level ``notatation_uri_map`` (notation -> URI) built at startup.
    Unmappable notations are reported and skipped.

    Args:
        ids (list[str]): Raw identifiers from the dataset's subject column.

    Returns:
        list[str]: URIs wrapped in angle brackets, in input order.
    """
    uris = []
    for concept_id in ids:  # renamed from `id`, which shadows the builtin
        if len(concept_id) == 6:
            # Numeric EuroVoc id: drop leading zeros and build the URI directly.
            uris.append(
                f'<http://eurovoc.europa.eu/{concept_id.lstrip("0")}>'
            )
        else:
            # BUG FIX: the original indexed notatation_uri_map[id], which raises
            # KeyError for unknown notations — the warning branch below was
            # unreachable. .get() returns None instead, so we actually report
            # and skip unmappable notations as intended.
            mapped_uri = notatation_uri_map.get(concept_id)
            if mapped_uri is not None:
                uris.append(
                    "<" + mapped_uri + ">"
                )
            else:
                print(f"Could not map notation {concept_id} to uri")
    return uris
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def cleanup(text):
    """Strip double quotes and collapse all whitespace runs to single spaces.

    Makes a field safe for the naive CSV quoting used by the converter.
    """
    without_quotes = text.replace('"', ' ')
    return " ".join(without_quotes.split())
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def convert_eurovoc_csv(input_file, output_file):
    """
    Convert CSV file with eurovoc subject URIs to proper format.

    Reads the downloaded dataset CSV, quotes/cleans the title and text
    fields, converts the ';'-separated subject ids to space-separated
    bracketed URIs, and writes the result as an Annif-style CSV.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to output CSV file
    """
    # FIX: the original wrote the header, then reopened the output file in
    # append mode once per input row — one open/close per row. Open each
    # file exactly once and stream rows through.
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile, \
         open(input_file, 'r', newline='', encoding='utf-8') as infile:

        print("document_id,url,date,type,title,text,subject_uris", file=outfile)

        reader = csv.DictReader(infile)
        for row in reader:
            # Work on a copy so the reader's row dict is left untouched.
            processed_row = row.copy()

            # Quote the title and text fields (cleanup strips embedded quotes,
            # so wrapping in '"' cannot produce broken CSV quoting).
            processed_row['title'] = f'"{cleanup(row["title"])}"'
            processed_row['text'] = f'"{cleanup(row["text"])}"'

            # Convert the ';'-separated ids to space-separated <URI> tokens.
            if row['subject_uris']:
                ids = row['subject_uris'].split(';')
                uris = convert_ids_to_uris(ids)
                processed_row['subject_uris'] = '"' + ' '.join(uris) + '"'
            else:
                processed_row['subject_uris'] = '""'

            # NOTE(review): relies on dict insertion order matching the
            # header; true for csv.DictReader rows on Python 3.7+.
            print(",".join(processed_row.values()), file=outfile)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# Usage example: build the notation map once, then convert the downloaded corpus.
if __name__ == "__main__":
    # Module-level global: convert_ids_to_uris() reads this to resolve
    # non-numeric EuroVoc notations. NOTE(review): the name is misspelled
    # ("notatation") but consistently so throughout the file.
    notatation_uri_map = load_eurovoc_notations("eurovoc-skos-ap-eu.rdf")
    convert_eurovoc_csv("european_parliament_eurovoc_2025_en.csv", 'european_parliament_eurovoc_2025_en_converted.csv')
|
convert-output-to-xlsx.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
|
| 5 |
+
def process_files_to_excel(file_paths, output_file):
    """
    Convert multiple text files into a single Excel file with specified structure.

    Each input file is expected to hold tab-separated lines whose first two
    columns are a (possibly <>-bracketed) URI and a label. One spreadsheet row
    is produced per file: 'File name', 'Vocabulary', then alternating
    'Tag label(en) N' / 'Tag id N' columns, padded to the widest file.

    Args:
        file_paths (list): List of paths to input text files
        output_file (str): Path to output Excel file
    """
    rows = []
    max_subjects = 0  # widest file determines how many tag-column pairs we emit

    for file_path in file_paths:
        filename = os.path.basename(file_path)

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Extract (uri, label) pairs; skip blank and malformed lines.
        file_data = []
        for line in lines:
            if line.strip():
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    # Remove <> brackets from the URI if present.
                    uri = parts[0].replace('<', '').replace('>', '')
                    file_data.append((uri, parts[1]))

        # Report the file under its original .txt name, not the .annif output.
        row = {'File name': filename.replace('.annif', '.txt')}
        # All current inputs are EuroVoc suggestions; the filename-based
        # vocabulary detection that used to live here was dead code.
        row['Vocabulary'] = 'EuroVoc'

        for i, (uri, label) in enumerate(file_data):
            row[f'Tag label(en) {i+1}'] = label
            row[f'Tag id {i+1}'] = uri
        rows.append(row)
        max_subjects = max(max_subjects, len(file_data))

    # Fixed columns first, then a label/id pair per subject slot.
    all_columns = ['File name', 'Vocabulary']
    for i in range(max_subjects):
        all_columns.extend([f'Tag label(en) {i+1}', f'Tag id {i+1}'])

    df = pd.DataFrame(rows, columns=all_columns)

    df.to_excel(output_file, index=False)
    print(f"Excel file saved as: {output_file}")
|
| 68 |
+
|
| 69 |
+
def main():
    """Collect every Annif output file (*.annif) in the working directory
    and merge them into one Excel workbook."""
    annif_files = [entry for entry in os.listdir('.') if entry.endswith('.annif')]

    output_file = 'output.xlsx'

    process_files_to_excel(annif_files, output_file)

if __name__ == "__main__":
    main()
|
download-dataset.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import csv
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
# Stream the dataset to avoid a full download up front.
dataset = load_dataset("EuropeanParliament/Eurovoc_2025", split="train", streaming=True)
output_file = "eurovoc_en.csv"

# Define output columns in Annif CSV format.
output_columns = ["document_id", "url", "date", "type", "title", "text", "subject_uris"]

# Open CSV once, write header, then stream matching rows into it.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()

    count = 0
    for row in dataset:
        # Keep only English documents that carry at least one EuroVoc concept.
        # (Concepts fetched once; the original called .get() twice per row.)
        concepts = row.get("eurovoc_concepts", [])
        if row.get("language") == "EN" and concepts:
            # Map dataset fields to the Annif output columns.
            out_row = {
                "document_id": row.get("reference", ""),
                "url": row.get("url", ""),
                "date": row.get("date", ""),
                "type": row.get("type", ""),
                "title": row.get("title", ""),
                "text": row.get("text", ""),
                # Annif expects subjects joined with ';' in a single column.
                "subject_uris": ";".join(concepts)
            }
            writer.writerow(out_row)
            count += 1

            # Periodic pause to be gentle on the streaming endpoint.
            # (FIX: message grammar — was "1 seconds".)
            if count % 100 == 0:
                print(f"Processed {count} rows, sleeping 1 second...")
                time.sleep(1)

print(f"✅ Saved {count} rows to {output_file}")
|