juhoinkinen commited on
Commit
6512684
·
verified ·
1 Parent(s): 594ce98

Upload folder using huggingface_hub

Browse files
convert-dataset-to-annif-corpus.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
+ from rdflib import Graph
4
+ from rdflib.namespace import SKOS
5
+
6
+
7
# Raise the per-field size cap (default 128 KiB x 100) so very long
# document texts survive csv parsing.
csv.field_size_limit(131072 * 100)
8
+
9
+
10
def load_eurovoc_notations(graph_filename):
    """Parse a SKOS RDF/XML vocabulary and map notations to concept URIs.

    Args:
        graph_filename: Path to the RDF/XML (SKOS) vocabulary dump.

    Returns:
        dict: skos:notation string -> concept URI string.
    """
    graph = Graph()
    graph.parse(graph_filename, format='xml')
    print("Graph parsed")

    return {
        str(notation): str(concept)
        for concept, notation in graph.subject_objects(SKOS.notation)
    }
20
+
21
+
22
def convert_ids_to_uris(ids):
    """Convert EuroVoc identifiers/notations to angle-bracketed URIs.

    Six-character ids are zero-padded numeric EuroVoc concept ids and map
    directly to ``http://eurovoc.europa.eu/<id>``; anything else is looked
    up in the module-level ``notatation_uri_map`` built from the SKOS
    vocabulary in ``__main__``.

    Args:
        ids: Iterable of id/notation strings.

    Returns:
        list[str]: URIs wrapped in ``<...>``. Notations that cannot be
        mapped are skipped, with a warning printed to stdout.
    """
    uris = []
    for concept_id in ids:
        if len(concept_id) == 6:
            uris.append(f'<http://eurovoc.europa.eu/{concept_id.lstrip("0")}>')
        else:
            # Bug fix: the original indexed notatation_uri_map[id] directly,
            # which raises KeyError for unknown notations and made the
            # `is not None` branch dead code. .get() restores the intended
            # warn-and-skip behavior.
            mapped_uri = notatation_uri_map.get(concept_id)
            if mapped_uri is not None:
                uris.append("<" + mapped_uri + ">")
            else:
                print(f"Could not map notation {concept_id} to uri")
    return uris
39
+
40
+
41
def cleanup(text):
    """Strip double quotes and collapse runs of whitespace to single spaces."""
    without_quotes = text.replace('"', ' ')
    return " ".join(without_quotes.split())
43
+
44
+
45
def convert_eurovoc_csv(input_file, output_file):
    """
    Convert CSV file with eurovoc subject URIs to proper format.

    Reads the raw dataset CSV, quotes the free-text fields, expands the
    ';'-separated subject ids into full '<...>' URIs via
    convert_ids_to_uris(), and writes the result with a fixed header.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to output CSV file
    """
    # Fix: open the output once instead of re-opening it in append mode for
    # every input row (the original paid one open/close per row).
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        print("document_id,url,date,type,title,text,subject_uris", file=outfile)

        reader = csv.DictReader(infile)
        for row in reader:
            processed_row = row.copy()

            # Quote the free-text fields; cleanup() strips embedded quotes
            # so the manual quoting below stays well-formed.
            processed_row['title'] = f'"{cleanup(row["title"])}"'
            processed_row['text'] = f'"{cleanup(row["text"])}"'

            # Expand ';'-separated subject ids into full URIs.
            if row['subject_uris']:
                ids = row['subject_uris'].split(';')
                uris = convert_ids_to_uris(ids)
                processed_row['subject_uris'] = '"' + ' '.join(uris) + '"'
            else:
                processed_row['subject_uris'] = '""'

            # NOTE(review): the manual join assumes non-text fields contain
            # no commas and that input column order matches the header
            # printed above — confirm against the dataset.
            print(",".join(processed_row.values()), file=outfile)
86
+
87
+
88
# Usage example: build the notation->URI map, then convert the dataset CSV.
if __name__ == "__main__":
    notatation_uri_map = load_eurovoc_notations("eurovoc-skos-ap-eu.rdf")
    convert_eurovoc_csv(
        "european_parliament_eurovoc_2025_en.csv",
        "european_parliament_eurovoc_2025_en_converted.csv",
    )
convert-output-to-xlsx.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from collections import defaultdict
4
+
5
def process_files_to_excel(file_paths, output_file):
    """
    Convert multiple text files into a single Excel file with specified structure.

    Each input file holds tab-separated lines of '<uri>\tlabel'. Every file
    becomes one spreadsheet row with a 'Tag label(en) N' / 'Tag id N'
    column pair per subject found in the file.

    Args:
        file_paths (list): List of paths to input text files
        output_file (str): Path to output Excel file
    """
    records = []
    widest = 0  # largest number of (uri, label) pairs seen in any one file

    for path in file_paths:
        base_name = os.path.basename(path)

        with open(path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        # Keep the first two tab-separated columns of each non-empty line,
        # stripping the <> wrapper from the URI if present.
        pairs = []
        for raw in raw_lines:
            stripped = raw.strip()
            if not stripped:
                continue
            cols = stripped.split('\t')
            if len(cols) >= 2:
                uri = cols[0].replace('<', '').replace('>', '')
                pairs.append((uri, cols[1]))

        # One spreadsheet row per file; vocabulary is fixed to EuroVoc.
        record = {
            'File name': base_name.replace('.annif', '.txt'),
            'Vocabulary': 'EuroVoc',
        }
        for index, (uri, label) in enumerate(pairs, start=1):
            record[f'Tag label(en) {index}'] = label
            record[f'Tag id {index}'] = uri
        records.append(record)
        widest = max(widest, len(pairs))

    # Column order: file metadata first, then a label/id pair per subject.
    columns = ['File name', 'Vocabulary']
    for index in range(1, widest + 1):
        columns.extend([f'Tag label(en) {index}', f'Tag id {index}'])

    df = pd.DataFrame(records, columns=columns)
    df.to_excel(output_file, index=False)
    print(f"Excel file saved as: {output_file}")
68
+
69
def main():
    # Collect every Annif output file (*.annif) in the current directory
    # and merge them into a single spreadsheet.
    annif_files = [name for name in os.listdir('.') if name.endswith('.annif')]
    process_files_to_excel(annif_files, 'output.xlsx')


if __name__ == "__main__":
    main()
download-dataset.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import csv
import time

from datasets import load_dataset

# Stream the dataset so the full corpus never has to be downloaded at once.
dataset = load_dataset("EuropeanParliament/Eurovoc_2025", split="train", streaming=True)
output_file = "eurovoc_en.csv"

# Output columns in Annif CSV format.
output_columns = ["document_id", "url", "date", "type", "title", "text", "subject_uris"]

# Open CSV once, write header, then append each matching row as it streams in.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()

    count = 0
    for row in dataset:
        # Keep only English documents that carry at least one EuroVoc concept
        # (truthiness also tolerates an explicit None, which len() would not).
        if row.get("language") == "EN" and row.get("eurovoc_concepts"):
            # Map dataset fields to output columns.
            out_row = {
                "document_id": row.get("reference", ""),
                "url": row.get("url", ""),
                "date": row.get("date", ""),
                "type": row.get("type", ""),
                "title": row.get("title", ""),
                "text": row.get("text", ""),
                "subject_uris": ";".join(row.get("eurovoc_concepts", []))
            }
            writer.writerow(out_row)
            count += 1

            # Throttle every 100 rows to be polite to the streaming endpoint.
            if count % 100 == 0:
                # Fixed message grammar ("1 seconds" -> "1 second").
                print(f"Processed {count} rows, sleeping 1 second...")
                time.sleep(1)

    print(f"✅ Saved {count} rows to {output_file}")