juhoinkinen commited on
Commit
6512684
·
verified ·
1 Parent(s): 594ce98

Upload folder using huggingface_hub

Browse files
convert-dataset-to-annif-corpus.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
+ from rdflib import Graph
4
+ from rdflib.namespace import SKOS
5
+
6
+
7
# Raise the per-field size cap (default 128 KiB x 100) so very long
# document texts survive csv parsing.
csv.field_size_limit(131072 * 100)
8
+
9
+
10
def load_eurovoc_notations(graph_filename):
    """Parse a SKOS RDF/XML vocabulary and map notations to concept URIs.

    Args:
        graph_filename: Path to the RDF/XML (SKOS) vocabulary dump.

    Returns:
        dict: skos:notation string -> concept URI string.
    """
    graph = Graph()
    graph.parse(graph_filename, format='xml')
    print("Graph parsed")

    return {
        str(notation): str(concept)
        for concept, notation in graph.subject_objects(SKOS.notation)
    }
20
+
21
+
22
def convert_ids_to_uris(ids):
    """Convert EuroVoc identifiers/notations to angle-bracketed URIs.

    Six-character ids are zero-padded numeric EuroVoc concept ids and map
    directly to ``http://eurovoc.europa.eu/<id>``; anything else is looked
    up in the module-level ``notatation_uri_map`` built from the SKOS
    vocabulary in ``__main__``.

    Args:
        ids: Iterable of id/notation strings.

    Returns:
        list[str]: URIs wrapped in ``<...>``. Notations that cannot be
        mapped are skipped, with a warning printed to stdout.
    """
    uris = []
    for concept_id in ids:
        if len(concept_id) == 6:
            uris.append(f'<http://eurovoc.europa.eu/{concept_id.lstrip("0")}>')
        else:
            # Bug fix: the original indexed notatation_uri_map[id] directly,
            # which raises KeyError for unknown notations and made the
            # `is not None` branch dead code. .get() restores the intended
            # warn-and-skip behavior.
            mapped_uri = notatation_uri_map.get(concept_id)
            if mapped_uri is not None:
                uris.append("<" + mapped_uri + ">")
            else:
                print(f"Could not map notation {concept_id} to uri")
    return uris
39
+
40
+
41
def cleanup(text):
    """Strip double quotes and collapse runs of whitespace to single spaces."""
    without_quotes = text.replace('"', ' ')
    return " ".join(without_quotes.split())
43
+
44
+
45
def convert_eurovoc_csv(input_file, output_file):
    """
    Convert CSV file with eurovoc subject URIs to proper format.

    Reads the raw dataset CSV, quotes the free-text fields, expands the
    ';'-separated subject ids into full '<...>' URIs via
    convert_ids_to_uris(), and writes the result with a fixed header.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to output CSV file
    """
    # Fix: open the output once instead of re-opening it in append mode for
    # every input row (the original paid one open/close per row).
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        print("document_id,url,date,type,title,text,subject_uris", file=outfile)

        reader = csv.DictReader(infile)
        for row in reader:
            processed_row = row.copy()

            # Quote the free-text fields; cleanup() strips embedded quotes
            # so the manual quoting below stays well-formed.
            processed_row['title'] = f'"{cleanup(row["title"])}"'
            processed_row['text'] = f'"{cleanup(row["text"])}"'

            # Expand ';'-separated subject ids into full URIs.
            if row['subject_uris']:
                ids = row['subject_uris'].split(';')
                uris = convert_ids_to_uris(ids)
                processed_row['subject_uris'] = '"' + ' '.join(uris) + '"'
            else:
                processed_row['subject_uris'] = '""'

            # NOTE(review): the manual join assumes non-text fields contain
            # no commas and that input column order matches the header
            # printed above — confirm against the dataset.
            print(",".join(processed_row.values()), file=outfile)
86
+
87
+
88
# Usage example: build the notation->URI map, then convert the dataset CSV.
if __name__ == "__main__":
    notatation_uri_map = load_eurovoc_notations("eurovoc-skos-ap-eu.rdf")
    convert_eurovoc_csv(
        "european_parliament_eurovoc_2025_en.csv",
        "european_parliament_eurovoc_2025_en_converted.csv",
    )
convert-output-to-xlsx.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from collections import defaultdict
4
+
5
def process_files_to_excel(file_paths, output_file):
    """
    Convert multiple text files into a single Excel file with specified structure.

    Each input file holds tab-separated lines of '<uri>\tlabel'. Every file
    becomes one spreadsheet row with a 'Tag label(en) N' / 'Tag id N'
    column pair per subject found in the file.

    Args:
        file_paths (list): List of paths to input text files
        output_file (str): Path to output Excel file
    """
    records = []
    widest = 0  # largest number of (uri, label) pairs seen in any one file

    for path in file_paths:
        base_name = os.path.basename(path)

        with open(path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        # Keep the first two tab-separated columns of each non-empty line,
        # stripping the <> wrapper from the URI if present.
        pairs = []
        for raw in raw_lines:
            stripped = raw.strip()
            if not stripped:
                continue
            cols = stripped.split('\t')
            if len(cols) >= 2:
                uri = cols[0].replace('<', '').replace('>', '')
                pairs.append((uri, cols[1]))

        # One spreadsheet row per file; vocabulary is fixed to EuroVoc.
        record = {
            'File name': base_name.replace('.annif', '.txt'),
            'Vocabulary': 'EuroVoc',
        }
        for index, (uri, label) in enumerate(pairs, start=1):
            record[f'Tag label(en) {index}'] = label
            record[f'Tag id {index}'] = uri
        records.append(record)
        widest = max(widest, len(pairs))

    # Column order: file metadata first, then a label/id pair per subject.
    columns = ['File name', 'Vocabulary']
    for index in range(1, widest + 1):
        columns.extend([f'Tag label(en) {index}', f'Tag id {index}'])

    df = pd.DataFrame(records, columns=columns)
    df.to_excel(output_file, index=False)
    print(f"Excel file saved as: {output_file}")
68
+
69
def main():
    # Collect every Annif output file (*.annif) in the current directory
    # and merge them into a single spreadsheet.
    annif_files = [name for name in os.listdir('.') if name.endswith('.annif')]
    process_files_to_excel(annif_files, 'output.xlsx')


if __name__ == "__main__":
    main()
download-dataset.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import csv
import time

from datasets import load_dataset

# Stream the dataset so the full corpus never has to be downloaded at once.
dataset = load_dataset("EuropeanParliament/Eurovoc_2025", split="train", streaming=True)
output_file = "eurovoc_en.csv"

# Output columns in Annif CSV format.
output_columns = ["document_id", "url", "date", "type", "title", "text", "subject_uris"]

# Open CSV once, write header, then append each matching row as it streams in.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()

    count = 0
    for row in dataset:
        # Keep only English documents that carry at least one EuroVoc concept
        # (truthiness also tolerates an explicit None, which len() would not).
        if row.get("language") == "EN" and row.get("eurovoc_concepts"):
            # Map dataset fields to output columns.
            out_row = {
                "document_id": row.get("reference", ""),
                "url": row.get("url", ""),
                "date": row.get("date", ""),
                "type": row.get("type", ""),
                "title": row.get("title", ""),
                "text": row.get("text", ""),
                "subject_uris": ";".join(row.get("eurovoc_concepts", []))
            }
            writer.writerow(out_row)
            count += 1

            # Throttle every 100 rows to be polite to the streaming endpoint.
            if count % 100 == 0:
                # Fixed message grammar ("1 seconds" -> "1 second").
                print(f"Processed {count} rows, sleeping 1 second...")
                time.sleep(1)

    print(f"✅ Saved {count} rows to {output_file}")