Knowledge Base

Find answers to common questions about Cloudmersive products and services.



Intelligent Document Field Extraction with Python and Cloudmersive Document AI API
2/12/2026 - Cloudmersive Support


In this sample we will use the Cloudmersive Document AI API to intelligently extract all fields from an input document and save the results to JSON in the same folder.

Prior to running the script, install the SDK:

pip install cloudmersive-documentai-api-client requests

Then configure the variables INPUT_FILE_PATH, API_BASE_PATH, and API_KEY appropriately.

"""
Cloudmersive Document AI - Advanced Extract All Fields
- Prints results to console
- Saves JSON output alongside the input file (same name, .json)
- Shows status/spinner while waiting

Install:
  pip install cloudmersive-documentai-api-client
"""

import base64
import json
import os
import sys
import threading
import time
from pprint import pprint

import cloudmersive_documentai_api_client
from cloudmersive_documentai_api_client.rest import ApiException


# =========================
# USER CONFIG (edit these)
# =========================
# These module-level constants are read by main(); edit them before running.

# Base URL of the API endpoint that the SDK client is pointed at.
# Example: "https://api.cloudmersive.com" (default Cloudmersive public cloud)
# Or your Private Cloud endpoint base URL (e.g., "https://your-privatecloud.example.com")
API_BASE_PATH = "https://api.cloudmersive.com"

# Your Cloudmersive API key.
# The value below is only a placeholder and must be replaced with a real key.
API_KEY = "YOUR-API-KEY"

# Full local path to the document you want to process.
# The JSON result is written next to it, with the extension replaced by ".json".
INPUT_FILE_PATH = r"C:\Users\input.pdf"

# Optional parameters (passed through unchanged to the extraction call)
RECOGNITION_MODE = "Advanced"   # "Advanced" (default/high accuracy) or "Normal"
# NOTE(review): the "Compatability" spelling is kept exactly as given in the
# original sample - confirm the accepted value against the API reference.
PREPROCESSING = "Auto"          # "Auto" (default), "Paged", or "Compatability"


def _spinner(stop_event: threading.Event, prefix: str = "Processing"):
    frames = ["|", "/", "-", "\\"]
    i = 0
    while not stop_event.is_set():
        sys.stdout.write(f"\r{prefix}... {frames[i % len(frames)]}")
        sys.stdout.flush()
        i += 1
        time.sleep(0.15)
    sys.stdout.write("\r" + " " * 60 + "\r")
    sys.stdout.flush()


def main():
    """Send the configured document to Document AI and store the JSON result.

    Reads the module-level settings (INPUT_FILE_PATH, API_BASE_PATH, API_KEY,
    RECOGNITION_MODE, PREPROCESSING), runs the extraction call on a background
    thread while a console spinner is displayed, prints the serialized
    response, and writes it to a ".json" file next to the input file.

    Raises:
        FileNotFoundError: If INPUT_FILE_PATH does not point to a file.

    Exits the process with status 1 if the API call raised any exception.
    """
    source_path = INPUT_FILE_PATH
    if not os.path.isfile(source_path):
        raise FileNotFoundError(f"Input file not found: {source_path}")

    # Same directory and base name as the input, with a .json extension.
    stem, _ = os.path.splitext(source_path)
    json_path = stem + ".json"

    banner = (
        f"Input file:  {source_path}",
        f"Output JSON: {json_path}",
        f"API base:    {API_BASE_PATH}",
        f"Mode:        {RECOGNITION_MODE}",
        f"Preprocess:  {PREPROCESSING}",
        "Preparing request...",
    )
    for line in banner:
        print(line)

    # Build the SDK client against the configured endpoint and key.
    sdk = cloudmersive_documentai_api_client
    config = sdk.Configuration()
    config.api_key["Apikey"] = API_KEY
    config.host = API_BASE_PATH
    client = sdk.ApiClient(config)
    extract_api = sdk.ExtractApi(client)

    # The worker thread reports back through this dict: exactly one of the
    # two slots is filled once the call finishes.
    outcome = {"resp": None, "err": None}

    def _call_api():
        try:
            # IMPORTANT: pass the FILE PATH (string), not an open file handle
            outcome["resp"] = extract_api.extract_all_fields_and_tables(
                recognition_mode=RECOGNITION_MODE,
                preprocessing=PREPROCESSING,
                input_file=source_path,
            )
        except Exception as exc:
            # Captured here so it can be reported from the main thread.
            outcome["err"] = exc

    finished = threading.Event()
    spinner = threading.Thread(
        target=_spinner,
        args=(finished, "Waiting for Document AI result"),
        daemon=True,
    )
    caller = threading.Thread(target=_call_api, daemon=True)

    print("Calling Document AI (Extract All Fields and Tables)...")
    spinner.start()
    caller.start()
    caller.join()
    # The call is done; stop the spinner and let it clear its line.
    finished.set()
    spinner.join()

    failure = outcome["err"]
    if failure is not None:
        if isinstance(failure, ApiException):
            print("\nAPI Exception:")
            print(f"Status:  {failure.status}")
            print(f"Reason:  {failure.reason}")
            print(f"Body:\n{failure.body}")
        else:
            print("\nError:")
            print(repr(failure))
        sys.exit(1)

    # Convert the SDK response model into plain JSON-serializable data.
    payload = client.sanitize_for_serialization(outcome["resp"])
    rendered = json.dumps(payload, indent=2, ensure_ascii=False)

    print("\n=== Document AI Result (JSON) ===")
    print(rendered)

    with open(json_path, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)

    print(f"\nSaved JSON to: {json_path}")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

600 free API calls/month, with no expiration

Sign Up Now or Sign in with Google    Sign in with Microsoft

Questions? We'll be your guide.

Contact Sales