# Discovery Environment API Testing Scripts
Note: This code is from Day 9 (6/13/24) of the logbook.
## Initial Setup
import requests
import json
from datetime import datetime
# Base URL for the Discovery Environment API.
# This is the main endpoint for accessing the Discovery Environment API, which provides various
# services like metadata retrieval, file management, and more.
base_url = 'https://de.cyverse.org/terrain'

# API key (replace with your actual API key).
# The key is sent as a bearer token for authentication when making requests to the API.
# Replace 'YOUR_API_KEY' with your actual API key to access the API services.
api_key = 'Bearer ' + 'YOUR_API_KEY'

# Headers for the requests.
# These headers include the API key and are required for making authorized requests to the API.
headers = {
    'Authorization': api_key
}
---
## Print JSON Data in a Readable Format
```python
def pretty_print(json_data):
    """
    Print JSON data as indented, key-sorted text.

    Handy while debugging to check that a response or payload has the
    expected structure.

    Args:
        json_data (dict): JSON data to be pretty-printed.
    """
    rendered = json.dumps(json_data, sort_keys=True, indent=4)
    print(rendered)
## Convert Milliseconds since Epoch to Date
def convert_to_date(milliseconds, tz=None):
    """
    Convert milliseconds since epoch to a human-readable date and time.

    Used to turn the 'date-created' and 'date-modified' millisecond
    timestamps into 'YYYY-MM-DD HH:MM:SS' strings.

    Args:
        milliseconds (int | float): Milliseconds since the Unix epoch.
        tz (datetime.tzinfo | None): Time zone in which to render the
            timestamp. The default, None, uses the local time zone of the
            machine running the script. Pass ``datetime.timezone.utc`` to
            get output that does not depend on where the script runs.

    Returns:
        str: Human-readable date and time.
    """
    seconds = milliseconds / 1000  # datetime.fromtimestamp expects seconds
    moment = datetime.fromtimestamp(seconds, tz=tz)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
## Get the metadata for a specific data id
def get_metadata(data_id, timeout=30):
    """
    Get metadata for a specific data ID.

    Sends a GET request to the Discovery Environment API's
    ``/filesystem/{data_id}/metadata`` endpoint.

    Args:
        data_id (str): The ID of the data item.
        timeout (float | tuple): Seconds to wait for the server before giving
            up, passed straight to ``requests.get``. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        dict | None: The parsed JSON metadata for the specified data ID, or
        None (after printing the status code and response text) when the
        response status is not 200.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout``.
    """
    url = f'{base_url}/filesystem/{data_id}/metadata'
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Error getting metadata: {response.status_code} - {response.text}")
        return None
    return response.json()
## Get all metadata for a dataset
def get_all_metadata_dataset(dataset):
    """
    Get all metadata for a dataset.

    Collects the dataset's creation and modification dates, its path, and
    every attribute/value pair (AVU) returned by get_metadata for the dataset
    ID. Used to migrate the datasets and their metadata to CKAN.

    Args:
        dataset (dict): The dataset dictionary. Must contain the keys
            'date-created', 'date-modified', 'path' and 'id'.

    Returns:
        dict: A dictionary containing all metadata for the dataset. An
        attribute that occurs more than once maps to a list of its values.
        If the metadata request fails (get_metadata returns None), only the
        dates and path are returned.
    """
    metadata_dict = {
        'date_created': convert_to_date(int(dataset['date-created'])),
        'date_modified': convert_to_date(int(dataset['date-modified'])),
        'de_path': dataset['path'],
    }

    metadata_return = get_metadata(dataset['id'])
    if not metadata_return:
        # get_metadata has already printed the error; return what we have
        # instead of failing on None['avus'].
        return metadata_dict

    for avu in metadata_return.get('avus') or []:
        key = avu['attr']
        value = avu['value']
        if key not in metadata_dict:
            metadata_dict[key] = value
        elif isinstance(metadata_dict[key], list):
            # Third and later occurrences: extend the existing list.
            metadata_dict[key].append(value)
        else:
            # Second occurrence: promote the single value to a list.
            metadata_dict[key] = [metadata_dict[key], value]
    return metadata_dict
## Get all metadata for a file
def get_all_metadata_file(file):
    """
    Get metadata for a specific file.

    Builds a flat dictionary with the file's creation and modification dates,
    its path, name, type (taken from the name) and WebDAV location. Used for
    migrating files to CKAN.

    Args:
        file (dict): The file dictionary. Must contain the keys
            'date-created', 'date-modified', 'path' and 'label'.

    Returns:
        dict: A dictionary containing all metadata for the file.
    """
    created_str = convert_to_date(int(file['date-created']))
    modified_str = convert_to_date(int(file['date-modified']))
    de_path = file['path']
    label = file['label']

    # The file type is whatever follows the last '.'; a name without any '.'
    # has no type.
    _, dot, extension = label.rpartition('.')
    if not dot:
        extension = ''

    # The WebDAV URL is the anonymous base followed by the path with the
    # curated-commons prefix removed.
    dav_base = "https://data.cyverse.org/dav-anon/iplant/commons/cyverse_curated/"
    relative_path = file['path'].replace('/iplant/home/shared/commons_repo/curated/', '')

    return {
        'date_created': created_str,
        'date_modified': modified_str,
        'de_path': de_path,
        'file_name': label,
        'file_type': extension,
        'web_dav_location': dav_base + relative_path,
    }
## Get the list of files/folders in a specified directory
def get_files(path, limit=10, timeout=30):
    """
    Get the list of files and folders in a specified directory.

    Sends a GET request to the Discovery Environment API's
    ``/secured/filesystem/paged-directory`` endpoint. Useful for migrating
    files/folders from a directory to CKAN.

    Args:
        path (str): The path to the directory.
        limit (int): The maximum number of files to retrieve.
        timeout (float | tuple): Seconds to wait for the server before giving
            up, passed straight to ``requests.get``. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        dict | None: The parsed JSON response containing the list of files,
        or None (after printing the status code and response text) when the
        response status is not 200.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout``.
    """
    url = f'{base_url}/secured/filesystem/paged-directory'
    params = {'limit': limit, 'path': path}
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Error getting files: {response.status_code} - {response.text}")
        return None
    return response.json()
## Get a list of all datasets with some of their metadata
def get_datasets(timeout=30):
    """
    Get a list of all datasets with some of their metadata.

    Lists the folders under '/iplant/home/shared/commons_repo/curated/' via
    the Discovery Environment API's ``/secured/filesystem/directory``
    endpoint. The rest of each dataset's metadata can be retrieved using the
    get_metadata function. Used in conjunction with get_all_metadata_dataset
    to migrate datasets and their metadata to CKAN.

    Args:
        timeout (float | tuple): Seconds to wait for the server before giving
            up, passed straight to ``requests.get``. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        list | None: A list of dictionaries, each representing a dataset with
        its metadata (the 'folders' entry of the response), or None (after
        printing the status code and response text) when the response status
        is not 200.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout``.
    """
    path = '/iplant/home/shared/commons_repo/curated/'
    url = f'{base_url}/secured/filesystem/directory'
    response = requests.get(url, headers=headers, params={'path': path}, timeout=timeout)
    if response.status_code != 200:
        print(f"Error getting directories: {response.status_code} - {response.text}")
        return None
    # Each entry under 'folders' is one curated dataset directory.
    return response.json()['folders']