Skip to main content

Import files with the API

The Nuclia API is a great way to import data into Nuclia.

Prerequisites

Get a contributor API key as detailed here.

Using the Python SDK

Here is a typical Python script collecting all the files from a folder tree and pushing them to Nuclia:

import os
import sys
from nuclia import sdk

# Entry names that are never uploaded (macOS / Windows housekeeping files).
IGNORE = [".DS_Store", "Thumbs.db"]

# Placeholders: substitute your zone, Knowledge Box id and API key.
KNOWLEDGE_BOX = "https://<zone>.nuclia.cloud/api/v1/kb/<your-knowledge-box-id>"
API_KEY = "<your-api-key-with-contributor-access>"

# Hand the Knowledge Box URL and the API key to the SDK. This runs at module
# level, before any upload is attempted; presumably the SDK keeps these
# credentials for the calls made later in the script.
sdk.NucliaAuth().kb(url=KNOWLEDGE_BOX, token=API_KEY)

def upload_folder(path):
    """Upload every file found under ``path`` to Nuclia, recursing into sub-folders.

    Entries whose name starts with a dot or is listed in ``IGNORE`` are
    skipped, whether they are files or folders.

    Args:
        path: Folder to walk.

    Raises:
        OSError: If ``path`` cannot be listed.
    """
    # Sorted so that repeated runs visit (and upload) entries in the same order.
    for name in sorted(os.listdir(path)):
        if name.startswith(".") or name in IGNORE:
            continue
        entry_path = os.path.join(path, name)
        if os.path.isdir(entry_path):
            upload_folder(entry_path)
        else:
            sdk.NucliaUpload().file(path=entry_path)

if __name__ == "__main__":
    # The folder to import is the first command-line argument; exit with a
    # usage hint rather than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} /path/to/folder")
    upload_folder(sys.argv[1])

Before running the script, make sure you are using Python 3:

python3 --version

Install the nuclia package:

pip install nuclia

Then:

python3 import.py /path/to/folder

Using direct calls to the API endpoints

Using the Nuclia Python SDK is not mandatory: you can also send HTTP requests directly to the API endpoints.

Here is how it would be done in Python, but it can be adapted to other languages:

import os
import requests
import sys
import mimetypes
import urllib3

# Suppress urllib3's warnings for the rest of the run.
# NOTE(review): this also hides the InsecureRequestWarning emitted for requests
# made with certificate verification turned off.
urllib3.disable_warnings()

# Entry names that are never uploaded (macOS / Windows housekeeping files).
IGNORE = [".DS_Store", "Thumbs.db"]

# Placeholders: substitute your zone, Knowledge Box id and API key.
# BACKEND and KNOWLEDGE_BOX are concatenated to build the endpoint URLs.
BACKEND = "https://<zone>.nuclia.cloud/api/v1"
KNOWLEDGE_BOX = "/kb/<your-knowledge-box-id>"
API_KEY = "<your-api-key-with-contributor-access>"

def upload_file(content_path):
    """Upload one file to the Knowledge Box through the ``/upload`` endpoint.

    The file content is sent as the raw request body. Its name travels in the
    ``x-filename`` header and its MIME type (guessed from the file extension,
    ``application/octet-stream`` when unknown) in ``content-type``.
    Non-ASCII file names and non-201 responses are reported on stdout rather
    than raised, so a folder import carries on with the remaining files.

    Args:
        content_path: Path of the file to upload.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    file_name = os.path.basename(content_path)
    try:
        # The name is sent as-is in an HTTP header, which this script only
        # supports for ASCII names.
        file_name.encode('ascii')
    except UnicodeEncodeError:
        # Report and skip (as done for HTTP errors below) instead of aborting
        # the whole import with a traceback.
        # NOTE(review): check the upload endpoint documentation for a
        # supported way to send non-ASCII names.
        print(f'Error importing {file_name!a}: the file name contains non-ASCII characters')
        return

    file_upload_path = f'{BACKEND}{KNOWLEDGE_BOX}/upload'
    print(f'Importing {content_path} at {file_upload_path}')

    with open(content_path, "rb") as source_file:
        payload = source_file.read()

    # TLS certificate verification is left enabled (the requests default):
    # the API key must not be sent to a server whose identity is unchecked.
    response = requests.post(
        file_upload_path,
        headers={
            "content-type": mimetypes.guess_type(content_path)[0] or "application/octet-stream",
            "x-filename": file_name,
            "X-NUCLIA-SERVICEACCOUNT": "Bearer " + API_KEY,
            "x-synchronous": "true",
        },
        data=payload,
    )
    if response.status_code != 201:
        print(f'Error {response.status_code} importing {file_name}')

def upload_folder(path):
    """Upload every file found under ``path``, recursing into sub-folders.

    Entries whose name starts with a dot or is listed in ``IGNORE`` are
    skipped, whether they are files or folders. Each remaining file is passed
    to ``upload_file``.

    Args:
        path: Folder to walk.

    Raises:
        OSError: If ``path`` cannot be listed.
    """
    # Sorted so that repeated runs visit (and upload) entries in the same order.
    for name in sorted(os.listdir(path)):
        if name.startswith(".") or name in IGNORE:
            continue
        entry_path = os.path.join(path, name)
        if os.path.isdir(entry_path):
            upload_folder(entry_path)
        else:
            upload_file(entry_path)

if __name__ == "__main__":
    # The folder to import is the first command-line argument; exit with a
    # usage hint rather than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} /path/to/folder")
    upload_folder(sys.argv[1])

Set labels on imported files

If you want to set labels on the files you are importing, you first have to create a resource carrying the expected metadata, and then upload the file into a file field of that resource.

import os
import sys
from nuclia import sdk

# Entry names that are never uploaded (macOS / Windows housekeeping files).
IGNORE = [".DS_Store", "Thumbs.db"]

# Placeholders: substitute your zone, Knowledge Box id and API key.
KNOWLEDGE_BOX = "https://<zone>.nuclia.cloud/api/v1/kb/<your-knowledge-box-id>"
API_KEY = "<your-api-key-with-contributor-access>"

# Hand the Knowledge Box URL and the API key to the SDK. This runs at module
# level, before any resource is created or file uploaded; presumably the SDK
# keeps these credentials for the calls made later in the script.
sdk.NucliaAuth().kb(url=KNOWLEDGE_BOX, token=API_KEY)

def upload_file(content_path, labelset, label):
    """Create a labelled resource for one file, then upload the file into it.

    The resource is titled after the file name and classified with the given
    label; the file itself is uploaded into the resource's ``file`` field.

    Args:
        content_path: Path of the file to upload.
        labelset: Id of the label set the label belongs to.
        label: Label to set on the new resource.

    Raises:
        OSError: If the file cannot be opened for reading; this is checked
            before the resource is created.
    """
    # The title is passed as text (not ASCII-encoded bytes), so file names
    # containing non-ASCII characters are accepted.
    file_name = os.path.basename(content_path)
    print(f'Importing {content_path}')

    # Open the file once up front so that an unreadable path raises before a
    # resource is created for it. The handle is not needed afterwards: the
    # SDK upload call is given the path.
    with open(content_path, "rb"):
        pass

    rid = sdk.NucliaResource().create(
        title=file_name,
        usermetadata={
            "classifications": [{"labelset": labelset, "label": label}]
        },
    )
    sdk.NucliaUpload().file(rid=rid, path=content_path, field="file")

def upload_folder(path, labelset, label):
    """Upload every file found under ``path`` with the given label, recursing into sub-folders.

    Entries whose name starts with a dot or is listed in ``IGNORE`` are
    skipped, whether they are files or folders. Each remaining file is passed
    to ``upload_file`` together with ``labelset`` and ``label``.

    Args:
        path: Folder to walk.
        labelset: Id of the label set, forwarded unchanged for every file.
        label: Label value, forwarded unchanged for every file.

    Raises:
        OSError: If ``path`` cannot be listed.
    """
    # Sorted so that repeated runs visit (and upload) entries in the same order.
    for name in sorted(os.listdir(path)):
        if name.startswith(".") or name in IGNORE:
            continue
        entry_path = os.path.join(path, name)
        if os.path.isdir(entry_path):
            upload_folder(entry_path, labelset, label)
        else:
            upload_file(entry_path, labelset, label)

if __name__ == "__main__":
    # Three arguments are required: the folder, the label set id and the
    # label. Exit with a usage hint rather than an IndexError traceback when
    # any of them is missing.
    if len(sys.argv) < 4:
        sys.exit(f'Usage: {sys.argv[0]} /path/to/folder "label set id" "label"')
    root, labelset, label = sys.argv[1:4]
    upload_folder(root, labelset, label)

You will then pass the label set and label as arguments:

python3 import.py /path/to/folder "documents" "Purchase Order"

Note: The label set and label must already exist in Nuclia. Here "documents" is the label set id (which is derived from the label set name) and "Purchase Order" is the actual label value.