Upload a dataset to Zenodo

Install required modules listed in requirements.txt with pip:

$ pip install -r requirements.txt

Create a .env file in this folder with the following content:

ACCESS_TOKEN=myzenodotoken

Load required modules

[25]:
# Imports and credentials for the whole notebook.
import sys

# Make the parent directory importable so the local zenodo_helper module is found
sys.path.insert(0, '..')
from zenodo_helper import *

# Explicit imports come after the star import so these names always win,
# and so `os` / `time` no longer depend on what zenodo_helper happens to export.
import hashlib
import os
import time

import pandas as pd
from dotenv import load_dotenv

# Read the .env file into the process environment *before* looking up the token;
# without this call os.getenv() only sees variables that were already exported.
load_dotenv()
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')

Load dotenv in IPython Notebook

Load geoflow entities table with pandas

[26]:
# Read the geoflow entities CSV into a DataFrame and preview the first rows.
# NOTE: `input` shadows the Python builtin of the same name; it is kept because
# the upload cell builds its file paths from this variable.
input = '/home/sylvain/Documents/IRD/DATA4'
file_entities = '20231018-164240_iso19115-metadata.csv'
# (the former `df = {}` placeholder was a dead store, overwritten on the next line)
df = pd.read_csv(os.path.join(input, file_entities))
df.head()
[26]:
Identifier Title Description Subject Creator Date Type Language SpatialCoverage TemporalCoverage Relation Rights Provenance Format Data
0 20230524_REU-ermitage_UAV-02_1 title:Images UAV du projet TELEMAC, Ermitage, ... abstract:"This dataset is made of 324 images c... theme[General]:TELEMAC,Réunion,Hermitage,drone... author:sylvain.poulain@ird.fr,pascal.mouquet@i... publication:2023-10-18_\nedition:2023-10-18 dataset fra NaN 2023-05-24 07-35-31 - 2023-05-24 07-58-33 thumbnail:telemac@https://www.osureunion.fr/wp... useLimitation:Utilisation libre sous réserve d... statement:"- Camera model and parameters:\n Ma... resource:image/jpg_\ndistribution:application/... source:SurveyMetadata.gpkg@/home/sylvain/Docum...

Upload to Zenodo

[27]:
### Upload to Zenodo
# For every row of `df`:
#   1. extract the file names listed after "source:" in the Data column,
#   2. reuse the deposit referenced by an existing "doi:" in Identifier, or
#      initialise a new deposit, write its DOI back into `df` and push metadata,
#   3. upload each file and repeat until the local MD5 appears in the deposit's
#      file listing returned by the server.
# NOTE(review): the retry loop has no upper bound; a permanent server-side
# failure other than 404 will loop forever - consider adding a cap.
print("#### Upload zip files to Zenodo")
base_url = "https://zenodo.org/api/"
zenodo_baseurl = base_url

# Hash files in 1 MiB pieces so large archives are never loaded fully in memory
MD5_CHUNK_SIZE = 1024 * 1024
# Re-fetch the deposit information every N upload attempts
RESET_EVERY = 15

for zipul in range(len(df)):
    identifier = df.iloc[zipul]['Identifier']
    print('\nDataset:', zipul+1, "/" ,len(df), "\n    ", identifier)

    #### Extract source from Data in dataframe
    # "source:" entries are comma separated "name@location" pairs; keep the names only.
    source_spec = df.iloc[zipul]['Data'].split('source:')[1].split('_\n')[0]
    data_zip = [entry.split('@')[0] for entry in source_spec.split(',')]
    print(data_zip)

    ### Put Metadata or verify if doi exists (Metadata not updated if doi exists)
    if 'doi:' in identifier:
        # Numeric suffix of the DOI, e.g. "10.5281/zenodo.123" -> "123"
        doi_raw = identifier.split('_\ndoi:')[1].split('\n')[0].split('.')[-1]
        print("DOI already present: ",doi_raw)
        getrecid = zenlist_single(zenodo_baseurl, ACCESS_TOKEN, str(doi_raw))
        zenval = zenvar(getrecid)
    else:
        print("Initialize deposit")
        r = check_token(zenodo_baseurl, ACCESS_TOKEN)
        zenval = zenvar(r)
        print("prereserved doi:"+zenval[1])
        # Define doi_raw here too: the periodic refresh in the upload loop reads it,
        # and it was previously unset (or stale from an earlier dataset) on this path.
        doi_raw = zenval[1].split('\n')[0].split('.')[-1]
        print("Write DOI to dataframe")
        # dfzen is an alias of df (not a copy): the DOI is written into df itself.
        dfzen = df
        # NOTE(review): when Identifier already starts with "id:" nothing is
        # written back, so the freshly reserved DOI is not recorded - confirm intent.
        if 'id:' not in identifier:
            dfzen.iloc[zipul, dfzen.columns.get_loc('Identifier')] = "id:" + identifier + "_\ndoi:" + zenval[1]
            dfzen.iloc[zipul, dfzen.columns.get_loc("Provenance")] = dfzen.iloc[zipul]["Provenance"] + "_\nprocess:Raw dataset uploaded to " + base_url.split('api')[0] + "record/" + str(zenval[2])

        print("Enrich upload with metadata")
        zen_metadata = zenmdt(zenodo_baseurl, ACCESS_TOKEN, zenval[2], df, zipul)
        # >= 400 so that a plain 400 Bad Request (validation error) is caught as well
        if zen_metadata.status_code >= 400:
            print("error in metadata, please check there is no double keywords: \n" + zen_metadata.text)
            break

    print("upload data")
    print("Trying upload number: 1")

    # Identifier now has the form "id:<name>_\ndoi:<doi>"; <name> is the dataset folder.
    dataset_dir = os.path.join(input, df.iloc[zipul]['Identifier'].split('_\n')[0].split(':')[1])

    for file in data_zip:
        ul_count = 1
        ### Control md5 checksum:
        ful = os.path.join(dataset_dir, file)
        print(ful)
        digest = hashlib.md5()
        with open(ful, "rb") as file_to_check:
            # Feed the hash chunk by chunk instead of reading the whole file at once
            for chunk in iter(lambda: file_to_check.read(MD5_CHUNK_SIZE), b""):
                digest.update(chunk)
        md5_hex = digest.hexdigest()
        print("    md5:", md5_hex)

        # Upload until the server lists a file carrying our checksum
        while md5_hex not in zenlist_single_files(zenodo_baseurl, ACCESS_TOKEN, str(zenval[2])).text:
            # Reset connection every 15 tries (modulo, so it repeats beyond the 15th)
            if ul_count % RESET_EVERY == 0:
                getrecid = zenlist_single(zenodo_baseurl, ACCESS_TOKEN, str(doi_raw))
                zenval = zenvar(getrecid)

            file_on_server = zenlist_single_files(zenodo_baseurl, ACCESS_TOKEN, str(zenval[2]))
            ### Clean wrong checksum
            # Same file name on the server but a different checksum: delete it first.
            if file in file_on_server.text:
                print("    clean wrong files")
                for remote in file_on_server.json():
                    if file == remote['filename']:
                        print("    checksum on server", remote['checksum'])
                        zen_del_file(remote["links"]["self"], ACCESS_TOKEN)

            zen_upload = zenul(zenval[0], ACCESS_TOKEN, dataset_dir, [file])
            print("        ", zen_upload.text)
            if zen_upload.status_code == 404:
                print("Version doesn't exists ! Please check your record_id")
                break

            ul_count += 1
            print("    Retry number: " + str(ul_count))
            if zen_upload.status_code == 403:
                print("        Friday 13th 2023 nightmare => permission denied!")
                time.sleep(5)

#### Upload zip files to Zenodo

Dataset: 1 / 1
     20230524_REU-ermitage_UAV-02_1
['SurveyMetadata.gpkg']
Initialize deposit
Allowed to deposit some files
prereserved doi:10.5281/zenodo.10072349
Write DOI to dataframe
Enrich upload with metadata
{'metadata': {'title': 'Images UAV du projet TELEMAC, Ermitage, Réunion du 20230524', 'publication_date': '2023-10-18', 'description': '"This dataset is made of 324 images collected by UAV in Ermitage, Réunion, 20230524. 100% of these images present a geolocalization that is inferred thanks to a GPS. More information can be found at 10.5281/zenodo.8362271\n<br />\n<br />Contenu du dépôt:\n<br />└─ 20230524_REU-ermitage_UAV-02_1\n<br />  └─ DCIM\n<br />  └─ GPS\n<br />    └─ base_2023_05_24_pascal\n<br />    └─ reach_2023_05_24_drone\n<br />      └─ reachsylvai_raw_202305240249_RINEX_3_03\n<br />      └─ reachsylvai_raw_202305240330_RINEX_3_03\n<br />    └─ reach_2023_05_24_rover\n<br />  └─ METADATA\n<br />    └─ tb\n<br />  └─ PROCESSED\n<br />\n<br />- Camera model and parameters:\n<br /> Make: Hasselblad\n<br /> Model: L1D-20c\n<br /> Width: 5472\n<br /> Height: 3648\n<br /> Focal: 28\n<br /> WhiteBalance: Manual\n<br /> ExposureMode: Auto Exposure\n<br /> ColoSpace: sRGB\n<br /> EV: -0.7\n<br /> MeteringMode: CenterWeightedAverage\n<br /> Camera Pitch: -80.00\n<br />\n<br />- Survey informations:\n<br /> No Images: 324\n<br /> Median height: 70 meters\n<br /> Survey area: 8.97 hectares\n<br /> Survey from: 2023:05:24 07:35:31 to: 2023:05:24 07:58:33"', 'access_right': 'open', 'notes': '<p></p><div class="ui message warning">This study was funded by the European Regional Development Fund (ERDF) within the programme Interreg V 2014-2020 through the project G2OI</div><br /><img src="https://github.com/IRDG2OI/geoflow-g2oi/raw/main/img/logos_partenaires.png?raw=True">', 'creators': [{'name': 'sylvain.poulain@ird.fr'}, {'name': 'pascal.mouquet@ird.fr'}, {'name': 'emmanuel.cordier@univ-reunion.fr'}], 'keywords': ['TELEMAC', 'Réunion', 'Hermitage', 'drone', 'lagon', 'corail', 'reef'], 'related_identifiers': [{'identifier': 'urn:20230524_REU-ermitage_UAV-02_1', 'relation': 'isIdenticalTo', 'scheme': 'urn'}], 'version': 'v1', 'language': 'fra', 'license': 'cc-by-4.0', 
'imprint_publisher': 'Zenodo', 'upload_type': 'dataset'}}
upload data
Trying upload number: 1
/home/sylvain/Documents/IRD/DATA4/20230524_REU-ermitage_UAV-02_1/SurveyMetadata.gpkg
    md5: 9d635f5c3e30a2453e8fc9826fb2d202
    Sleep 5 seconds before new upload
upload: SurveyMetadata.gpkg
         {"created": "2023-11-05T06:34:41.313453+00:00", "updated": "2023-11-05T06:34:42.913152+00:00", "version_id": "f25e784c-d9dc-4d65-bae2-c34ba439555c", "key": "SurveyMetadata.gpkg", "size": 10149888, "mimetype": "application/octet-stream", "checksum": "md5:9d635f5c3e30a2453e8fc9826fb2d202", "is_head": true, "delete_marker": false, "links": {"self": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg", "version": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg?versionId=f25e784c-d9dc-4d65-bae2-c34ba439555c", "uploads": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg?uploads"}}
    Retry number: 2

Display DOI

[31]:
# Show every Identifier value with its "_\n" separators turned into plain line breaks
for identifier in df['Identifier']:
    print(identifier.replace('_\n', '\n'))
id:20230524_REU-ermitage_UAV-02_1
doi:10.5281/zenodo.10072349
[33]:
### Remove ACCESS_TOKEN
# Overwrite the in-memory token with an empty string
ACCESS_TOKEN = str()