In this article, we will redact a large (60+ minute) audio file with the Cloudmersive DLP API, following the Batch Job pattern and calling the API through the Cloudmersive Python SDK for the DLP API.
First, we need to install the Cloudmersive Python DLP API SDK:
pip install cloudmersive-dlp-api-client
Next, create a file named dlp.py containing the following script:
import base64
import json
import time
from pathlib import Path
import cloudmersive_dlp_api_client
from cloudmersive_dlp_api_client.rest import ApiException
# ---
# Configuration variables
# ---
# Base URL of the DLP API endpoint. Any trailing "/" is stripped when the
# client is built, so either form works.
API_BASE_PATH = "https://YOUR-CLOUDMERSIVE-ENDPOINT-BASE-PATH" # Or your Managed Instance / Private Cloud base URL
# Sent as the "Apikey" credential on every request.
# NOTE(review): replace the placeholder; consider loading a real key from the
# environment instead of committing it to source control.
API_KEY = "YOUR_API_KEY_HERE"
# Audio file to redact; it is read fully into memory and Base64-encoded.
INPUT_FILE_PATH = r"/content/input.mp3"
# Where the decoded redacted audio returned by the job is written.
OUTPUT_WAV_PATH = r"/content/redacted-output.wav"
# Where the final job status (without the audio payload) is written as JSON.
OUTPUT_JSON_PATH = r"/content/redaction-result.json"
# Seconds to sleep between job status checks.
POLL_INTERVAL_SECONDS = 10
# Give up waiting for the job after this many seconds (4 hours).
POLL_TIMEOUT_SECONDS = 4 * 60 * 60
# Passed as the request's language_code.
LANGUAGE_CODE = "ENG"
# The four settings below are forwarded unchanged to the redaction request.
REDACTION_MODE = "Bleep" # "Bleep" or "Mute"
TRANSCRIPT_REDACTION_MODE = "SemanticTag" # "SemanticTag", "Delete", or "ReplaceWithAsterisk"
SPEECH_RECOGNITION_MODE = "Normal" # "Fast", "Normal", or "Advanced"
PROVIDE_ANALYSIS_RATIONALE = True
def write_json(path: str, data: dict) -> None:
    """Serialize *data* as indented JSON and write it to *path*.

    The document is serialized completely before the destination is touched,
    so a serialization failure neither truncates an existing file nor leaves
    a partially written one behind. Missing parent directories are created.

    Args:
        path: Destination file path.
        data: JSON-serializable mapping to write.

    Raises:
        TypeError: If *data* contains a value that ``json`` cannot encode.
        ValueError: If *data* contains a circular reference.
        OSError: If the directory or file cannot be created or written.
    """
    # Serialize first: opening with "w" truncates the target immediately, so
    # it must only happen once we know there is a complete document to write.
    serialized = json.dumps(data, indent=2)

    output_path = Path(path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        f.write(serialized)
def write_base64_file(path: str, base64_data: str) -> None:
    """Decode a Base64 string and store the resulting bytes at *path*.

    Parent directories of *path* are created if they do not exist yet, and
    an existing file at *path* is overwritten.

    Args:
        path: Destination file path.
        base64_data: Base64-encoded content to decode and write.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    decoded_bytes = base64.b64decode(base64_data)
    with destination.open("wb") as handle:
        handle.write(decoded_bytes)
def make_api_client() -> cloudmersive_dlp_api_client.ApiClient:
    """Create an SDK ApiClient from the module-level connection settings.

    The host is API_BASE_PATH with any trailing "/" removed, and API_KEY is
    registered under the "Apikey" entry of the configuration's api_key map.

    Returns:
        An ApiClient wrapping the populated Configuration.
    """
    sdk = cloudmersive_dlp_api_client
    settings = sdk.Configuration()
    settings.host = API_BASE_PATH.rstrip("/")
    settings.api_key["Apikey"] = API_KEY
    return sdk.ApiClient(settings)
def submit_redaction_job(batch_api, input_file_path: str) -> str:
    """Submit the audio file as an advanced-redaction batch job.

    The file is read fully into memory, Base64-encoded, and sent together
    with the module-level language, redaction, and speech-recognition
    settings. Every supported category is submitted with its allow flag set
    to False.

    Args:
        batch_api: Batch-job API object exposing
            ``redact_audio_advanced_batch_job``.
        input_file_path: Path of the audio file to redact.

    Returns:
        The async job ID reported by the API.

    Raises:
        FileNotFoundError: If *input_file_path* does not exist.
        RuntimeError: If the submission is reported as unsuccessful or the
            response carries no async job ID.
    """
    source_file = Path(input_file_path)
    if not source_file.exists():
        raise FileNotFoundError(f"Input file does not exist: {source_file}")

    encoded_audio = base64.b64encode(source_file.read_bytes()).decode("utf-8")

    # Each category becomes an allow_<category> keyword on the request.
    redacted_categories = (
        "email_address",
        "phone_number",
        "street_address",
        "city",
        "person_name",
        "birth_date",
        "passport_number",
        "drivers_license",
        "social_security_number",
        "taxpayer_id",
        "credit_card_number",
        "credit_card_expiration_date",
        "credit_card_verification_code",
        "bank_account_number",
        "iban",
        "health_insurance_number",
        "bearer_token",
        "http_cookie",
        "private_keys",
        "credentials",
        "deep_web_urls",
        "source_code",
        "ip_address",
        "mac_address",
        "health_insurance_member_id",
        "medical_record_number",
        "billing_account_number",
        "health_injury_or_disease",
        "health_type_of_treatment",
        "health_date_and_time_of_treatment",
        "health_plan_beneficiary_number",
        "health_payments_made_for_treatment",
        "vehicle_id",
        "device_id",
        "names_of_relatives",
        "health_universal_record_locator",
        "biometrics",
    )
    # False means "do redact this category"; True means "allow it / do not redact it".
    allow_flags = {f"allow_{category}": False for category in redacted_categories}

    request = cloudmersive_dlp_api_client.DlpAdvancedAudioRedactionRequest(
        input_file=encoded_audio,
        language_code=LANGUAGE_CODE,
        redaction_mode=REDACTION_MODE,
        transcript_redaction_mode=TRANSCRIPT_REDACTION_MODE,
        provide_analysis_rationale=PROVIDE_ANALYSIS_RATIONALE,
        speech_recognition_mode=SPEECH_RECOGNITION_MODE,
        **allow_flags,
    )

    submission = batch_api.redact_audio_advanced_batch_job(body=request)
    if not submission.successful:
        raise RuntimeError(f"Failed to submit batch job: {submission}")

    job_id = submission.async_job_id
    if not job_id:
        raise RuntimeError(f"Batch job response did not include async_job_id: {submission}")
    return job_id
def poll_until_complete(batch_api, async_job_id: str):
    """Poll a batch job until it completes, reports an error, or times out.

    The job status is fetched and printed on every iteration, with
    POLL_INTERVAL_SECONDS of sleep between checks.

    Args:
        batch_api: Batch-job API object exposing ``get_async_job_status``.
        async_job_id: ID of the job to wait for.

    Returns:
        The status object of the completed job; its
        ``redact_audio_advanced_result`` is guaranteed to be non-None.

    Raises:
        RuntimeError: If a status request is reported as unsuccessful, the
            job reports an error message, or the job completes without a
            redaction result.
        TimeoutError: If the job has not completed after
            POLL_TIMEOUT_SECONDS.
    """
    # Use the monotonic clock for elapsed time: the wait can last hours, and
    # wall-clock adjustments must not cut the timeout short or extend it.
    start_time = time.monotonic()
    while True:
        status = batch_api.get_async_job_status(async_job_id=async_job_id)
        if not status.successful:
            raise RuntimeError(f"Failed to get batch job status: {status.error_message or status}")

        print(f"Job {async_job_id} status: {status.async_job_status}")

        if status.async_job_status == "COMPLETED":
            if status.redact_audio_advanced_result is None:
                raise RuntimeError(f"Job completed but redact_audio_advanced_result was empty: {status}")
            return status

        # A non-completed job carrying an error message is treated as failed.
        if status.error_message:
            raise RuntimeError(f"Batch job returned error: {status.error_message}")

        if time.monotonic() - start_time > POLL_TIMEOUT_SECONDS:
            raise TimeoutError(f"Timed out waiting for batch job {async_job_id}")

        time.sleep(POLL_INTERVAL_SECONDS)
def main() -> None:
    """Run the full redaction workflow for the configured input file.

    Submits the batch job, waits for it to complete, writes the decoded
    redacted audio to OUTPUT_WAV_PATH, writes the job status (without the
    audio payload) to OUTPUT_JSON_PATH, and prints a short summary.

    Raises:
        ApiException: Re-raised after its status, reason, and body have been
            printed.
        RuntimeError: If the completed job carries no redacted audio.
    """
    client = make_api_client()
    batch_api = cloudmersive_dlp_api_client.TasksBatchJobApi(client)

    try:
        job_id = submit_redaction_job(batch_api, INPUT_FILE_PATH)
        print(f"Submitted batch job. AsyncJobID: {job_id}")

        completed = poll_until_complete(batch_api, job_id)
        result = completed.redact_audio_advanced_result
        if not result.redacted_audio:
            raise RuntimeError("Completed job did not include redacted_audio.")
        write_base64_file(OUTPUT_WAV_PATH, result.redacted_audio)

        report = completed.to_dict()
        # The audio already lives in its own file; drop the large Base64
        # payload from the report and record the file location instead.
        result_section = report.get("redact_audio_advanced_result")
        if result_section and "redacted_audio" in result_section:
            result_section.pop("redacted_audio", None)
        report["redacted_audio_output_file"] = str(Path(OUTPUT_WAV_PATH).resolve())
        write_json(OUTPUT_JSON_PATH, report)

        print(f"Redacted WAV saved to: {OUTPUT_WAV_PATH}")
        print(f"Result JSON saved to: {OUTPUT_JSON_PATH}")
        print(f"CleanResult: {result.clean_result}")
        print(f"RedactedTranscript: {result.redacted_transcript}")
    except ApiException as error:
        # Print whatever diagnostic details the SDK exception exposes, then
        # let it propagate to the caller.
        print("Cloudmersive API error")
        for label, attribute in (("Status", "status"), ("Reason", "reason"), ("Body", "body")):
            print(f"{label}: {getattr(error, attribute, None)}")
        raise
# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()