Source code for squadds.database.HuggingFace

import os

from datasets import Dataset, concatenate_datasets, load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi, Repository, login

from squadds.core.globals import *


def fork_dataset(repo_id: str, dataset_name: str, new_dataset_name: str, private: bool = True):
    """
    Copy a dataset on the Hugging Face Hub under a new name.

    The source is loaded with ``load_dataset(repo_id, dataset_name)`` and the
    result is pushed back to the *same* ``repo_id``, with
    ``new_dataset_name`` passed as the second positional argument of
    ``push_to_hub``.

    Args:
        repo_id (str): The repo ID (namespace/repo) of the dataset to fork.
            It is used both as the source and as the push target.
        dataset_name (str): Name of the dataset to fork (second argument of
            ``load_dataset``).
        new_dataset_name (str): Name of the new dataset.
        private (bool): Whether the new dataset should be private or public.

    Returns:
        None
    """
    source = load_dataset(repo_id, dataset_name)
    # NOTE(review): the second positional parameter of ``push_to_hub`` has
    # changed meaning between ``datasets`` releases -- confirm it is the
    # configuration name in the version this project pins.
    source.push_to_hub(repo_id, new_dataset_name, private=private)
    print(f"Forked dataset '{dataset_name}' to '{new_dataset_name}'.")
def create_PR(repo_id: str, branch_name: str, title: str, description: str):
    """
    Create a Pull Request (PR) on Hugging Face Hub.

    Per the ``huggingface_hub`` documentation, ``HfApi.create_pull_request``
    accepts ``repo_id``, ``title``, ``description``, ``repo_type`` and
    ``token``. It has no ``head`` or ``body`` parameter, so passing those
    made every call fail with ``TypeError``. The Hub opens the PR on its own
    ref, so a source branch cannot be selected through this call.

    Args:
        repo_id (str): The repo ID (namespace/repo) where the PR will be created.
        branch_name (str): Kept for backward compatibility; not forwarded to
            the Hub because the API offers no way to choose the head branch.
        title (str): The title of the PR.
        description (str): A description of the changes made in the PR.

    Returns:
        The object returned by ``HfApi.create_pull_request`` describing the
        created PR (documented upstream as ``DiscussionWithDetails``).

    Raises:
        Exception: Any error raised by the Hub client is printed and re-raised.
    """
    api = HfApi()
    try:
        pr_info = api.create_pull_request(
            repo_id=repo_id,
            title=title,
            description=description,
            # This module manages dataset repos; the client otherwise
            # defaults to looking up a model repo with this ID.
            repo_type="dataset",
        )
    except Exception as e:
        print(f"Failed to create PR: {e}")
        raise
    print(f"Created PR '{title}' on repo '{repo_id}'.")
    return pr_info
# Make sure you are logged in to Hugging Face
def login_to_huggingface():
    """
    Log into Hugging Face using an API token from environment variables.

    The ``.env`` file at ``ENV_FILE_PATH`` is loaded first, then the token is
    read from the ``HUGGINGFACE_API_KEY`` environment variable and handed to
    ``huggingface_hub.login``.

    Raises:
        ValueError: If ``HUGGINGFACE_API_KEY`` is unset or empty.
    """
    load_dotenv(ENV_FILE_PATH)  # Load environment variables from .env file
    token = os.getenv("HUGGINGFACE_API_KEY")
    # An empty value (e.g. a bare ``HUGGINGFACE_API_KEY=`` line) is as
    # unusable as a missing one, so reject both before calling the Hub.
    if not token:
        raise ValueError("Hugging Face API token not found in environment variables.")
    login(token)
    print("Successfully logged in to Hugging Face")
# Load the dataset from the Hugging Face Hub
def load_hf_dataset(dataset_name: str, config: str = None):
    """
    Fetch a dataset from the Hugging Face Hub.

    Thin wrapper around ``load_dataset(dataset_name, config)`` that also
    prints a confirmation line.

    Args:
        dataset_name (str): The name or path of the dataset on the Hugging Face Hub.
        config (str): Specific configuration or version of the dataset.
            Defaults to ``None``.

    Returns:
        Dataset or DatasetDict: Loaded dataset.
    """
    loaded = load_dataset(dataset_name, config)
    print(f"Loaded dataset: {dataset_name}")
    return loaded
# Add a new column to the dataset
def add_column_to_dataset(dataset: Dataset, column_name: str, column_data: list):
    """
    Return a copy of ``dataset`` extended with one extra column.

    Delegates to ``dataset.add_column(column_name, column_data)``.

    Args:
        dataset (Dataset): Hugging Face dataset to which you want to add a column.
        column_name (str): Name of the new column.
        column_data (list): Data for the new column.

    Returns:
        Dataset: Dataset with the new column.
    """
    extended = dataset.add_column(column_name, column_data)
    print(f"Added new column '{column_name}' to dataset.")
    return extended
# Remove a column from the dataset
def remove_column_from_dataset(dataset: Dataset, column_name: str):
    """
    Return ``dataset`` without the named column.

    Delegates to ``dataset.remove_columns`` with a one-element list.

    Args:
        dataset (Dataset): Hugging Face dataset from which you want to remove a column.
        column_name (str): Name of the column to remove.

    Returns:
        Dataset: Dataset with the column removed.
    """
    columns_to_drop = [column_name]
    trimmed = dataset.remove_columns(columns_to_drop)
    print(f"Removed column '{column_name}' from dataset.")
    return trimmed
# View a specific column in the dataset
def view_column_in_dataset(dataset: Dataset, column_name: str, num_values: int):
    """
    Print a preview of one column and return the whole column.

    Args:
        dataset (Dataset): Hugging Face dataset.
        column_name (str): Name of the column to view.
        num_values (int): Upper bound of the slice ``[:num_values]`` that is
            printed as a preview; the returned data is not truncated.

    Returns:
        The full contents of ``dataset[column_name]``.

    Raises:
        ValueError: If ``column_name`` is not one of ``dataset.column_names``.
    """
    known_columns = dataset.column_names
    if column_name not in known_columns:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    values = dataset[column_name]
    preview = values[:num_values]
    print(f"Data from column '{column_name}':")
    print(preview)
    return values
# Update a specific column in the dataset
def update_column_in_dataset(dataset: Dataset, column_name: str, new_column_data: list):
    """
    Replace every value of an existing column.

    The replacement is applied with ``dataset.map(..., with_indices=True)``:
    the row at position ``idx`` receives ``new_column_data[idx]``.

    Args:
        dataset (Dataset): Hugging Face dataset to update.
        column_name (str): Name of the column to update.
        new_column_data (list): List of new data to replace the existing column.

    Returns:
        Dataset: Updated dataset.

    Raises:
        ValueError: If the column does not exist, or if ``new_column_data``
            does not have exactly one value per row.
    """
    if column_name not in dataset.column_names:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    if len(new_column_data) != len(dataset):
        raise ValueError(f"The new data length ({len(new_column_data)}) does not match the dataset length ({len(dataset)}).")

    def _replace_value(row, idx):
        # The existing row content is not needed; only its position is.
        return {column_name: new_column_data[idx]}

    refreshed = dataset.map(_replace_value, with_indices=True)
    print(f"Updated column '{column_name}' in the dataset.")
    return refreshed
# Add a new row to the dataset
def add_row_to_dataset(dataset: Dataset, row_data: dict):
    """
    Add a new row to a dataset.

    The new dataset is rebuilt column by column, in the dataset's own column
    order, with the matching value from ``row_data`` appended to each column.
    Keys are validated first: building from ``row_data``'s keys alone would
    silently drop every dataset column the row does not mention.

    Args:
        dataset (Dataset): The Hugging Face dataset to which you want to add a row.
        row_data (dict): The row data in dictionary format. Its keys must be
            exactly the dataset's column names.

    Returns:
        Dataset: Dataset with the new row added.

    Raises:
        ValueError: If ``row_data`` lacks a dataset column or contains a key
            that is not a dataset column.
    """
    columns = list(dataset.column_names)
    missing = [name for name in columns if name not in row_data]
    unexpected = [name for name in row_data if name not in columns]
    if missing or unexpected:
        raise ValueError(
            "Row keys do not match the dataset columns "
            f"(missing: {missing}, unexpected: {unexpected})."
        )

    # list(...) copies the column so the append never depends on the column
    # accessor returning a plain, concatenable list.
    extended = {name: list(dataset[name]) + [row_data[name]] for name in columns}
    new_dataset = Dataset.from_dict(extended)
    print("Added new row to dataset.")
    return new_dataset
# Remove a row from the dataset by index
def remove_row_from_dataset(dataset: Dataset, row_index: int):
    """
    Remove a row from a dataset by index.

    Negative indices count from the end, as in ordinary Python indexing. An
    index outside the dataset raises instead of returning an unchanged
    dataset while reporting a removal.

    Args:
        dataset (Dataset): Hugging Face dataset from which you want to remove a row.
        row_index (int): Index of the row to remove.

    Returns:
        Dataset: Dataset with the row removed.

    Raises:
        IndexError: If ``row_index`` does not refer to an existing row.
    """
    num_rows = len(dataset)
    position = row_index + num_rows if row_index < 0 else row_index
    if not 0 <= position < num_rows:
        raise IndexError(
            f"Row index {row_index} is out of range for a dataset with {num_rows} rows."
        )

    kept_indices = [i for i in range(num_rows) if i != position]
    new_dataset = dataset.select(kept_indices)
    print(f"Removed row at index {row_index} from dataset.")
    return new_dataset
# Update a row in the dataset
def update_row_in_dataset(dataset: Dataset, row_index: int, new_row_data: dict):
    """
    Overwrite selected fields of one row and return the rebuilt dataset.

    The dataset is exported with ``to_dict()``, the entry at ``row_index`` is
    replaced in each column named by ``new_row_data``, and a new dataset is
    created from the result with ``Dataset.from_dict``.

    Args:
        dataset (Dataset): Hugging Face dataset to update.
        row_index (int): Index of the row to update.
        new_row_data (dict): The new data for the row, keyed by column name.
            Columns that are not mentioned keep their current value.

    Returns:
        Dataset: Updated dataset.
    """
    columns = dataset.to_dict()
    for column_name, value in new_row_data.items():
        columns[column_name][row_index] = value

    rebuilt = Dataset.from_dict(columns)
    print(f"Updated row at index {row_index} in the dataset.")
    return rebuilt
# View a specific row in the dataset
def view_row_in_dataset(dataset: Dataset, row_index: int):
    """
    Print one row of the dataset and return it.

    Args:
        dataset (Dataset): Hugging Face dataset.
        row_index (int): Index of the row to view.

    Returns:
        dict: Data for the specified row, as produced by ``dataset[row_index]``.
    """
    selected_row = dataset[row_index]
    print(f"Row {row_index}: {selected_row}")
    return selected_row
# Merge two datasets
def merge_datasets(dataset1: Dataset, dataset2: Dataset):
    """
    Concatenate two datasets into one.

    Delegates to ``concatenate_datasets`` with ``dataset1`` first and
    ``dataset2`` second.

    Args:
        dataset1 (Dataset): First dataset.
        dataset2 (Dataset): Second dataset.

    Returns:
        Dataset: Merged dataset.
    """
    parts = [dataset1, dataset2]
    combined = concatenate_datasets(parts)
    print("Merged two datasets.")
    return combined
# Save the dataset to Hugging Face Hub
def save_dataset_to_hf(dataset: Dataset, repo_id: str, dataset_name: str, private: bool = True):
    """
    Upload a dataset to the Hugging Face Hub.

    Args:
        dataset (Dataset): The dataset to push to Hugging Face Hub.
        repo_id (str): The repo ID (namespace/repo) on Hugging Face Hub.
        dataset_name (str): Name of the dataset on Hugging Face Hub; passed
            as the second positional argument of ``push_to_hub``.
        private (bool): Whether the dataset should be private or public.

    Returns:
        None
    """
    # NOTE(review): the second positional parameter of ``push_to_hub`` has
    # changed meaning between ``datasets`` releases -- confirm it is the
    # configuration name in the version this project pins.
    dataset.push_to_hub(
        repo_id,
        dataset_name,
        private=private,
    )
    print(f"Dataset '{dataset_name}' saved to Hugging Face Hub.")
# Filter the dataset based on a condition
def filter_dataset(dataset: Dataset, filter_fn):
    """
    Keep only the rows accepted by ``filter_fn``.

    Delegates to ``dataset.filter(filter_fn)``.

    Args:
        dataset (Dataset): Hugging Face dataset to filter.
        filter_fn (function): Function that returns True or False for filtering.

    Returns:
        Dataset: Filtered dataset.
    """
    kept_rows = dataset.filter(filter_fn)
    print("Filtered dataset based on condition.")
    return kept_rows
if __name__ == "__main__":
    # Script entry point: authenticate before doing any Hub work.
    # Log in to Hugging Face
    login_to_huggingface()

    # Repo ID (namespace/repo) this script works with.
    # NOTE(review): the original comment also mentions dataset_name and
    # new_dataset_name, but neither is defined in the visible code -- confirm
    # whether the script continues beyond this point.
    repo_id = "SQuADDS/SQuADDS_DB"