Upload a dataset to Zenodo
Install required modules listed in requirements.txt with pip:
$ pip install -r requirements.txt
Create a .env file in this folder with the following content:
ACCESS_TOKEN=myzenodotoken
Load required modules
[25]:
# Standard library (os and time are used by later cells; import them
# explicitly instead of relying on the star import below to provide them)
import hashlib
import os
import sys
import time

# Third-party
import pandas as pd
from dotenv import load_dotenv

# Import zenodo_helper module from the parent folder
sys.path.insert(0, '..')
from zenodo_helper import *

# Load the .env file into the process environment before reading the token;
# without this call os.getenv() only sees variables exported by the shell.
load_dotenv()
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
Load dotenv in IPython Notebook
Load geoflow entities table with pandas
[26]:
# Folder holding the geoflow entities CSV; the upload cell also joins this
# folder with each dataset identifier to locate the files to send.
# NOTE: `input` shadows the built-in of the same name, but the upload cell
# reads this variable, so the name is kept.
input = '/home/sylvain/Documents/IRD/DATA4'
file_entities = '20231018-164240_iso19115-metadata.csv'

# Read the entities table (one row per dataset) and preview it.
df = pd.read_csv(os.path.join(input, file_entities))
df.head()
[26]:
| Identifier | Title | Description | Subject | Creator | Date | Type | Language | SpatialCoverage | TemporalCoverage | Relation | Rights | Provenance | Format | Data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20230524_REU-ermitage_UAV-02_1 | title:Images UAV du projet TELEMAC, Ermitage, ... | abstract:"This dataset is made of 324 images c... | theme[General]:TELEMAC,Réunion,Hermitage,drone... | author:sylvain.poulain@ird.fr,pascal.mouquet@i... | publication:2023-10-18_\nedition:2023-10-18 | dataset | fra | NaN | 2023-05-24 07-35-31 - 2023-05-24 07-58-33 | thumbnail:telemac@https://www.osureunion.fr/wp... | useLimitation:Utilisation libre sous réserve d... | statement:"- Camera model and parameters:\n Ma... | resource:image/jpg_\ndistribution:application/... | source:SurveyMetadata.gpkg@/home/sylvain/Docum... |
Upload to Zenodo
[27]:
### Upload to Zenodo
# For every row of the entities table: find or create the Zenodo deposit,
# write the reserved DOI back into the table, push the metadata, then upload
# each source file and keep re-uploading until its MD5 shows up in the
# deposit's file listing.
# NOTE(review): check_token / zenvar / zenmdt / zenul / zenlist_single /
# zenlist_single_files / zen_del_file come from zenodo_helper, which is not
# visible here; what is said about them below is inferred from how this cell
# uses them and should be confirmed against that module.
print("#### Upload zip files to Zenodo")
base_url = "https://zenodo.org/api/"
zenodo_baseurl = base_url
RESET_EVERY = 15      # refresh the record handle every N upload attempts
HASH_CHUNK = 1 << 20  # hash files 1 MiB at a time instead of loading them whole

for zipul in range(len(df)):
    identifier = df.iloc[zipul]['Identifier']
    print('\nDataset:', zipul+1, "/" ,len(df), "\n ", identifier)

    #### Extract source from Data in dataframe
    # The "source:" entry is a comma-separated list of "name@path" items;
    # only the part before "@" is kept.
    source_spec = df.iloc[zipul]['Data'].split('source:')[1].split('_\n')[0]
    data_zip = [entry.split('@')[0] for entry in source_spec.split(',')]
    print(data_zip)

    ### Put Metadata or verify if doi exists (Metadata not updated if doi exists)
    if 'doi:' in identifier:
        doi_raw = identifier.split('_\ndoi:')[1].split('\n')[0].split('.')[-1]
        print("DOI already present: ",doi_raw)
        getrecid = zenlist_single(zenodo_baseurl, ACCESS_TOKEN, str(doi_raw))
        zenval = zenvar(getrecid)
    else:
        print("Initialize deposit")
        r = check_token(zenodo_baseurl, ACCESS_TOKEN)
        # zenval is indexed below as [0] upload target, [1] reserved DOI,
        # [2] record id.
        zenval = zenvar(r)
        print("prereserved doi:"+zenval[1])
        # Derive doi_raw the same way as in the branch above, so the
        # reconnect step further down also works for a fresh deposit
        # (it used to be undefined, or left over from a previous dataset).
        doi_raw = zenval[1].split('.')[-1]

        print("Write DOI to dataframe")
        dfzen = df  # alias, not a copy: the writes below land in df
        if 'id:' not in identifier:
            dfzen.iloc[zipul, dfzen.columns.get_loc('Identifier')] = "id:" + identifier + "_\ndoi:" + zenval[1]
            dfzen.iloc[zipul, dfzen.columns.get_loc("Provenance")] = dfzen.iloc[zipul]["Provenance"] + "_\nprocess:Raw dataset uploaded to " + base_url.split('api')[0] + "record/" + str(zenval[2])

        print("Enrich upload with metadata")
        zen_metadata = zenmdt(zenodo_baseurl, ACCESS_TOKEN, zenval[2], df, zipul)
        # >= 400 so that a plain "400 Bad Request" also stops the run.
        if zen_metadata.status_code >= 400:
            print("error in metadata, please check there is no double keywords: \n" + zen_metadata.text)
            break

    print("upload data")
    print("Trying upload number: 1")
    # Local folder of this dataset: the identifier value that follows "id:".
    # Re-read from df because the Identifier may have just been rewritten.
    dataset_dir = os.path.join(input, df.iloc[zipul]['Identifier'].split('_\n')[0].split(':')[1])
    for file in data_zip:
        ul_count = 1

        ### Control md5 checksum:
        ful = os.path.join(dataset_dir, file)
        print(ful)
        digest = hashlib.md5()
        with open(ful, "rb") as file_to_check:
            # Feed the hash chunk by chunk so large archives fit in memory.
            for chunk in iter(lambda: file_to_check.read(HASH_CHUNK), b""):
                digest.update(chunk)
        local_md5 = digest.hexdigest()
        print(" md5:", local_md5)

        # Loop until the local checksum appears in the deposit's file listing.
        while local_md5 not in zenlist_single_files(zenodo_baseurl, ACCESS_TOKEN, str(zenval[2])).text:
            # Reset connection every 15 tries
            if ul_count % RESET_EVERY == 0:
                getrecid = zenlist_single(zenodo_baseurl, ACCESS_TOKEN, str(doi_raw))
                zenval = zenvar(getrecid)

            ### Clean wrong checksum
            # The file name is listed but its checksum did not match:
            # delete the remote copy before uploading again.
            file_on_server = zenlist_single_files(zenodo_baseurl, ACCESS_TOKEN, str(zenval[2]))
            if file in file_on_server.text:
                print(" clean wrong files")
                for remote in file_on_server.json():
                    if remote['filename'] == file:
                        print(" checksum on server", remote['checksum'])
                        file_to_remove = zen_del_file(remote["links"]["self"], ACCESS_TOKEN)

            file_list = [file]
            zen_upload = zenul(zenval[0], ACCESS_TOKEN, dataset_dir, file_list)
            print(" ", zen_upload.text)
            if zen_upload.status_code == 404:
                print("Version doesn't exists ! Please check your record_id")
                break
            else:
                ul_count += 1
                print(" Retry number: " + str(ul_count))
                if zen_upload.status_code == 403:
                    print(" Friday 13th 2023 nightmare => permission denied!")
                # Pause before the loop condition queries the listing again.
                time.sleep(5)
#### Upload zip files to Zenodo
Dataset: 1 / 1
20230524_REU-ermitage_UAV-02_1
['SurveyMetadata.gpkg']
Initialize deposit
Allowed to deposit some files
prereserved doi:10.5281/zenodo.10072349
Write DOI to dataframe
Enrich upload with metadata
{'metadata': {'title': 'Images UAV du projet TELEMAC, Ermitage, Réunion du 20230524', 'publication_date': '2023-10-18', 'description': '"This dataset is made of 324 images collected by UAV in Ermitage, Réunion, 20230524. 100% of these images present a geolocalization that is inferred thanks to a GPS. More information can be found at 10.5281/zenodo.8362271\n<br />\n<br />Contenu du dépôt:\n<br />└─ 20230524_REU-ermitage_UAV-02_1\n<br /> └─ DCIM\n<br /> └─ GPS\n<br /> └─ base_2023_05_24_pascal\n<br /> └─ reach_2023_05_24_drone\n<br /> └─ reachsylvai_raw_202305240249_RINEX_3_03\n<br /> └─ reachsylvai_raw_202305240330_RINEX_3_03\n<br /> └─ reach_2023_05_24_rover\n<br /> └─ METADATA\n<br /> └─ tb\n<br /> └─ PROCESSED\n<br />\n<br />- Camera model and parameters:\n<br /> Make: Hasselblad\n<br /> Model: L1D-20c\n<br /> Width: 5472\n<br /> Height: 3648\n<br /> Focal: 28\n<br /> WhiteBalance: Manual\n<br /> ExposureMode: Auto Exposure\n<br /> ColoSpace: sRGB\n<br /> EV: -0.7\n<br /> MeteringMode: CenterWeightedAverage\n<br /> Camera Pitch: -80.00\n<br />\n<br />- Survey informations:\n<br /> No Images: 324\n<br /> Median height: 70 meters\n<br /> Survey area: 8.97 hectares\n<br /> Survey from: 2023:05:24 07:35:31 to: 2023:05:24 07:58:33"', 'access_right': 'open', 'notes': '<p></p><div class="ui message warning">This study was funded by the European Regional Development Fund (ERDF) within the programme Interreg V 2014-2020 through the project G2OI</div><br /><img src="https://github.com/IRDG2OI/geoflow-g2oi/raw/main/img/logos_partenaires.png?raw=True">', 'creators': [{'name': 'sylvain.poulain@ird.fr'}, {'name': 'pascal.mouquet@ird.fr'}, {'name': 'emmanuel.cordier@univ-reunion.fr'}], 'keywords': ['TELEMAC', 'Réunion', 'Hermitage', 'drone', 'lagon', 'corail', 'reef'], 'related_identifiers': [{'identifier': 'urn:20230524_REU-ermitage_UAV-02_1', 'relation': 'isIdenticalTo', 'scheme': 'urn'}], 'version': 'v1', 'language': 'fra', 'license': 'cc-by-4.0', 'imprint_publisher': 
'Zenodo', 'upload_type': 'dataset'}}
upload data
Trying upload number: 1
/home/sylvain/Documents/IRD/DATA4/20230524_REU-ermitage_UAV-02_1/SurveyMetadata.gpkg
md5: 9d635f5c3e30a2453e8fc9826fb2d202
Sleep 5 seconds before new upload
upload: SurveyMetadata.gpkg
{"created": "2023-11-05T06:34:41.313453+00:00", "updated": "2023-11-05T06:34:42.913152+00:00", "version_id": "f25e784c-d9dc-4d65-bae2-c34ba439555c", "key": "SurveyMetadata.gpkg", "size": 10149888, "mimetype": "application/octet-stream", "checksum": "md5:9d635f5c3e30a2453e8fc9826fb2d202", "is_head": true, "delete_marker": false, "links": {"self": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg", "version": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg?versionId=f25e784c-d9dc-4d65-bae2-c34ba439555c", "uploads": "https://zenodo.org/api/files/84ee1d92-4614-41f2-b614-f329c8e94d92/SurveyMetadata.gpkg?uploads"}}
Retry number: 2
Display DOI
[31]:
# Print every Identifier with its "_\n" field separator shown as a plain newline
for identifier in df['Identifier']:
    print(identifier.replace('_\n', '\n'))
id:20230524_REU-ermitage_UAV-02_1
doi:10.5281/zenodo.10072349
[33]:
# Overwrite the token held in the kernel with an empty string
ACCESS_TOKEN = ""