Audio Redaction Batch Job with Cloudmersive DLP API and Python

Knowledge Base

Find answers to common questions about Cloudmersive products and services.

5/26/2026 - Cloudmersive Support

In this article, we will perform audio redaction on a large (60+ minute) input audio file with the Cloudmersive DLP API. We will use the Batch Job pattern through the Cloudmersive Python SDK for the DLP API.

First, we need to install the Cloudmersive Python DLP API SDK:

pip install cloudmersive-dlp-api-client

Next, create a file named dlp.py containing the following script:

import base64
import json
import time
from pathlib import Path

import cloudmersive_dlp_api_client
from cloudmersive_dlp_api_client.rest import ApiException


# ---
# Configuration variables
# ---

# Service endpoint and credentials.
API_BASE_PATH = "https://YOUR-CLOUDMERSIVE-ENDPOINT-BASE-PATH"  # Or your Managed Instance / Private Cloud base URL
API_KEY = "YOUR_API_KEY_HERE"

# Input audio file and the two output artifacts written by the script.
INPUT_FILE_PATH = "/content/input.mp3"
OUTPUT_WAV_PATH = "/content/redacted-output.wav"
OUTPUT_JSON_PATH = "/content/redaction-result.json"

# Polling cadence and overall deadline, both in seconds.
POLL_INTERVAL_SECONDS = 10
POLL_TIMEOUT_SECONDS = 4 * 3600  # four hours

# Redaction request options.
LANGUAGE_CODE = "ENG"
REDACTION_MODE = "Bleep"  # "Bleep" or "Mute"
TRANSCRIPT_REDACTION_MODE = "SemanticTag"  # "SemanticTag", "Delete", or "ReplaceWithAsterisk"
SPEECH_RECOGNITION_MODE = "Normal"  # "Fast", "Normal", or "Advanced"
PROVIDE_ANALYSIS_RATIONALE = True


def write_json(path: str, data: dict) -> None:
    """Serialize ``data`` as 2-space-indented JSON and write it to ``path``.

    Missing parent directories are created. The payload is serialized in
    memory before the target file is touched, so a value that ``json`` cannot
    encode raises without truncating or half-writing an existing file.

    Args:
        path: Destination file path.
        data: JSON-serializable mapping to write.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot encode.
    """
    # Serialize first: opening the file for writing truncates it, which would
    # destroy a previous result if encoding then failed part-way through.
    serialized = json.dumps(data, indent=2)

    output_path = Path(path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(serialized, encoding="utf-8")


def write_base64_file(path: str, base64_data: str) -> None:
    """Decode ``base64_data`` and store the resulting bytes at ``path``.

    Parent directories of ``path`` are created when they do not exist yet.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    decoded = base64.b64decode(base64_data)
    with destination.open("wb") as handle:
        handle.write(decoded)


def make_api_client() -> cloudmersive_dlp_api_client.ApiClient:
    """Build an SDK ``ApiClient`` from the module-level ``API_BASE_PATH`` and ``API_KEY``."""
    sdk = cloudmersive_dlp_api_client

    config = sdk.Configuration()
    # Trailing slashes are stripped from the configured base URL.
    config.host = API_BASE_PATH.rstrip("/")
    config.api_key["Apikey"] = API_KEY

    return sdk.ApiClient(config)


def submit_redaction_job(batch_api, input_file_path: str) -> str:
    """Submit an audio file as an advanced audio redaction batch job.

    The file is read fully into memory, base64-encoded, and sent through
    ``batch_api.redact_audio_advanced_batch_job`` with every ``allow_*``
    category set to ``False`` (i.e. every category is redacted).

    Args:
        batch_api: Object exposing ``redact_audio_advanced_batch_job``.
        input_file_path: Path of the audio file to redact.

    Returns:
        The ``async_job_id`` reported by the submission response.

    Raises:
        FileNotFoundError: If nothing exists at ``input_file_path``.
        IsADirectoryError: If the path exists but is not a regular file.
        RuntimeError: If the response is unsuccessful or carries no job id.
    """
    input_path = Path(input_file_path)

    if not input_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {input_path}")

    # exists() is also true for directories; reject those here with a clear
    # message instead of failing later inside read_bytes() with a
    # platform-dependent error.
    if not input_path.is_file():
        raise IsADirectoryError(f"Input path is not a regular file: {input_path}")

    input_file_base64 = base64.b64encode(input_path.read_bytes()).decode("utf-8")

    request = cloudmersive_dlp_api_client.DlpAdvancedAudioRedactionRequest(
        input_file=input_file_base64,
        language_code=LANGUAGE_CODE,

        # False means "do redact this category"; True means "allow it / do not redact it".
        allow_email_address=False,
        allow_phone_number=False,
        allow_street_address=False,
        allow_city=False,
        allow_person_name=False,
        allow_birth_date=False,
        allow_passport_number=False,
        allow_drivers_license=False,
        allow_social_security_number=False,
        allow_taxpayer_id=False,
        allow_credit_card_number=False,
        allow_credit_card_expiration_date=False,
        allow_credit_card_verification_code=False,
        allow_bank_account_number=False,
        allow_iban=False,
        allow_health_insurance_number=False,
        allow_bearer_token=False,
        allow_http_cookie=False,
        allow_private_keys=False,
        allow_credentials=False,
        allow_deep_web_urls=False,
        allow_source_code=False,
        allow_ip_address=False,
        allow_mac_address=False,
        allow_health_insurance_member_id=False,
        allow_medical_record_number=False,
        allow_billing_account_number=False,
        allow_health_injury_or_disease=False,
        allow_health_type_of_treatment=False,
        allow_health_date_and_time_of_treatment=False,
        allow_health_plan_beneficiary_number=False,
        allow_health_payments_made_for_treatment=False,
        allow_vehicle_id=False,
        allow_device_id=False,
        allow_names_of_relatives=False,
        allow_health_universal_record_locator=False,
        allow_biometrics=False,

        redaction_mode=REDACTION_MODE,
        transcript_redaction_mode=TRANSCRIPT_REDACTION_MODE,
        provide_analysis_rationale=PROVIDE_ANALYSIS_RATIONALE,
        speech_recognition_mode=SPEECH_RECOGNITION_MODE,
    )

    job_result = batch_api.redact_audio_advanced_batch_job(body=request)

    if not job_result.successful:
        raise RuntimeError(f"Failed to submit batch job: {job_result}")

    if not job_result.async_job_id:
        raise RuntimeError(f"Batch job response did not include async_job_id: {job_result}")

    return job_result.async_job_id


def poll_until_complete(batch_api, async_job_id: str):
    """Poll a batch job until it completes, reports an error, or times out.

    The status is fetched with ``batch_api.get_async_job_status`` and printed
    on every iteration, sleeping ``POLL_INTERVAL_SECONDS`` between attempts.

    Args:
        batch_api: Object exposing ``get_async_job_status``.
        async_job_id: Identifier of the job to wait for.

    Returns:
        The status object whose ``async_job_status`` is ``"COMPLETED"`` and
        whose ``redact_audio_advanced_result`` is populated.

    Raises:
        RuntimeError: If a status call is unsuccessful, the job reports an
            ``error_message``, or it completes without a result.
        TimeoutError: If more than ``POLL_TIMEOUT_SECONDS`` elapse.
    """
    # A monotonic clock keeps the elapsed-time measurement immune to system
    # clock adjustments (NTP sync, manual changes) during a multi-hour wait.
    started = time.monotonic()

    while True:
        status = batch_api.get_async_job_status(async_job_id=async_job_id)

        if not status.successful:
            raise RuntimeError(f"Failed to get batch job status: {status.error_message or status}")

        print(f"Job {async_job_id} status: {status.async_job_status}")

        if status.async_job_status == "COMPLETED":
            if status.redact_audio_advanced_result is None:
                raise RuntimeError(f"Job completed but redact_audio_advanced_result was empty: {status}")

            return status

        # Any error message on a job that has not completed is treated as fatal.
        if status.error_message:
            raise RuntimeError(f"Batch job returned error: {status.error_message}")

        if time.monotonic() - started > POLL_TIMEOUT_SECONDS:
            raise TimeoutError(f"Timed out waiting for batch job {async_job_id}")

        time.sleep(POLL_INTERVAL_SECONDS)


def main() -> None:
    """Run the full workflow: submit the job, wait for it, save the outputs.

    The redacted audio is decoded to ``OUTPUT_WAV_PATH``; the final status
    (minus the audio payload) is written as JSON to ``OUTPUT_JSON_PATH``.
    An ``ApiException`` has its details printed and is then re-raised.
    """
    client = make_api_client()
    batch_api = cloudmersive_dlp_api_client.TasksBatchJobApi(client)

    try:
        job_id = submit_redaction_job(batch_api, INPUT_FILE_PATH)
        print(f"Submitted batch job. AsyncJobID: {job_id}")

        final_status = poll_until_complete(batch_api, job_id)
        result = final_status.redact_audio_advanced_result

        if not result.redacted_audio:
            raise RuntimeError("Completed job did not include redacted_audio.")

        write_base64_file(OUTPUT_WAV_PATH, result.redacted_audio)

        summary = final_status.to_dict()

        # The audio already lives in its own file; keep it out of the JSON.
        nested_result = summary.get("redact_audio_advanced_result")
        if nested_result and "redacted_audio" in nested_result:
            del nested_result["redacted_audio"]

        wav_location = Path(OUTPUT_WAV_PATH).resolve()
        summary["redacted_audio_output_file"] = str(wav_location)

        write_json(OUTPUT_JSON_PATH, summary)

        print(f"Redacted WAV saved to: {OUTPUT_WAV_PATH}")
        print(f"Result JSON saved to: {OUTPUT_JSON_PATH}")
        print(f"CleanResult: {result.clean_result}")
        print(f"RedactedTranscript: {result.redacted_transcript}")

    except ApiException as e:
        print("Cloudmersive API error")
        # Report whichever of these attributes the exception carries.
        for label, attribute in (("Status", "status"), ("Reason", "reason"), ("Body", "body")):
            print(f"{label}: {getattr(e, attribute, None)}")
        raise


if __name__ == "__main__":
    main()

Cloudmersive Cloud DLP AI API v1-11-0 Released - Cloudmersive APIs

How to Redact Data Leaks in Text using Python - Cloudmersive APIs

Cloudmersive Cloud DLP AI API v1-7-0 Released - Cloudmersive APIs

Cloudmersive Cloud Speech API v3-3-0 Released - Cloudmersive APIs

Cloudmersive Cloud DLP AI API v2-7-4 Released - Cloudmersive APIs