
Audio

Process audio input and generate audio output, enabling voice-based applications, transcription, and text-to-speech.

Prerequisites

pip install apertis

Get your API Key from Apertis
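
The snippets below assume the client can find your key at construction time. A minimal sketch, assuming the SDK follows the common pattern of accepting an explicit api_key argument; the APERTIS_API_KEY environment variable name is illustrative, not confirmed by this page:

import os
from apertis import Apertis

# Assumption: Apertis() accepts an api_key argument; the variable name
# APERTIS_API_KEY is a placeholder for wherever you store your key.
client = Apertis(api_key=os.environ["APERTIS_API_KEY"])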

Speech to Text (Transcription)

from apertis import Apertis

def main():
    client = Apertis()

    with open("audio.mp3", "rb") as audio_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )

    print(response.text)

if __name__ == "__main__":
    main()

Transcription with Options

from apertis import Apertis

def main():
    client = Apertis()

    with open("audio.mp3", "rb") as audio_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language="en",  # Specify language for better accuracy
            prompt="Technical discussion about AI and machine learning",  # Context hint
            response_format="verbose_json",  # Get detailed output
            temperature=0.0  # Lower = more deterministic
        )

    print(f"Text: {response.text}")

    if hasattr(response, 'segments'):
        print("\nSegments:")
        for segment in response.segments:
            print(f"  [{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")

if __name__ == "__main__":
    main()

Translation (Audio to English)

from apertis import Apertis

def main():
    client = Apertis()

    # Translate non-English audio to English text
    with open("spanish_audio.mp3", "rb") as audio_file:
        response = client.audio.translations.create(
            model="whisper-1",
            file=audio_file
        )

    print(f"English translation: {response.text}")

if __name__ == "__main__":
    main()

Text to Speech

from apertis import Apertis

def main():
    client = Apertis()

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input="Hello! Welcome to Apertis AI. How can I help you today?"
    )

    # Save to file
    with open("output.mp3", "wb") as f:
        f.write(response.content)

    print("Audio saved to output.mp3")

if __name__ == "__main__":
    main()

High Quality Text to Speech

from apertis import Apertis

def main():
    client = Apertis()

    response = client.audio.speech.create(
        model="tts-1-hd",  # High-definition model
        voice="nova",  # Different voice
        input="This is high quality text-to-speech audio.",
        response_format="opus",  # Opus format for better quality
        speed=1.0  # Normal speed (0.25 to 4.0)
    )

    with open("output.opus", "wb") as f:
        f.write(response.content)

    print("HD audio saved to output.opus")

if __name__ == "__main__":
    main()

Streaming Text to Speech

from apertis import Apertis

def main():
    client = Apertis()

    response = client.audio.speech.create(
        model="tts-1",
        voice="shimmer",
        input="This audio is being streamed as it's generated.",
    )

    # Stream the response body to a file in chunks
    with open("streamed_output.mp3", "wb") as f:
        for chunk in response.iter_bytes():
            f.write(chunk)

    print("Streamed audio saved to streamed_output.mp3")

if __name__ == "__main__":
    main()

Audio in Chat (Multimodal)

import base64
from apertis import Apertis

def main():
    client = Apertis()

    # Read and encode audio file
    with open("question.mp3", "rb") as f:
        audio_data = base64.standard_b64encode(f.read()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4.1-audio-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please respond to this audio message:"},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": audio_data,
                            "format": "mp3"
                        }
                    }
                ]
            }
        ]
    )

    print(response.choices[0].message.content)

if __name__ == "__main__":
    main()

Batch Transcription

import asyncio
from pathlib import Path
from apertis import AsyncApertis

async def transcribe_file(client: AsyncApertis, file_path: str) -> dict:
    """Transcribe a single audio file."""
    with open(file_path, "rb") as f:
        response = await client.audio.transcriptions.create(
            model="whisper-1",
            file=f
        )
    return {"file": file_path, "text": response.text}

async def main():
    client = AsyncApertis()

    # List of audio files to process
    audio_files = list(Path("audio_folder").glob("*.mp3"))

    # Process files concurrently
    tasks = [transcribe_file(client, str(f)) for f in audio_files]
    results = await asyncio.gather(*tasks)

    for result in results:
        print(f"\n{result['file']}:")
        print(f"  {result['text'][:100]}...")

if __name__ == "__main__":
    asyncio.run(main())

Supported Models

Transcription/Translation

Model | Description
whisper-1 | OpenAI Whisper for speech recognition

Text to Speech

Model | Description
tts-1 | Standard quality, fast
tts-1-hd | High definition quality

Available Voices

Voice | Description
alloy | Neutral, balanced
echo | Warm, conversational
fable | Expressive, narrative
onyx | Deep, authoritative
nova | Friendly, upbeat
shimmer | Clear, professional
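
To compare voices before settling on one, a small sketch like the following generates one sample per voice with the same request as the Text to Speech example above; the voice list comes from the table, while the sample sentence and output filenames are illustrative:

from apertis import Apertis

VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]

def main():
    client = Apertis()
    for voice in VOICES:
        # Generate the same sentence with each voice for side-by-side comparison
        response = client.audio.speech.create(
            model="tts-1",
            voice=voice,
            input="This is a short sample of this voice.",
        )
        with open(f"sample_{voice}.mp3", "wb") as f:
            f.write(response.content)
        print(f"Saved sample_{voice}.mp3")

if __name__ == "__main__":
    main()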

API Reference

Transcription Parameters

Parameter | Type | Description
file | file | Audio file (required)
model | str | Model identifier (required)
language | str | ISO language code
prompt | str | Context hint for transcription
response_format | str | "json", "text", "verbose_json", "srt", "vtt"
temperature | float | Sampling temperature
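
The "srt" and "vtt" formats are useful for subtitle workflows. A minimal sketch, assuming (as is common for these formats) that the response body is returned as plain subtitle text rather than a JSON object:

from apertis import Apertis

def main():
    client = Apertis()

    with open("audio.mp3", "rb") as audio_file:
        # Assumption: with response_format="srt" the call returns the raw
        # subtitle text instead of an object with a .text attribute.
        srt_text = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="srt",
        )

    with open("audio.srt", "w") as f:
        f.write(str(srt_text))

    print("Subtitles saved to audio.srt")

if __name__ == "__main__":
    main()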

Speech Parameters

Parameter | Type | Description
model | str | TTS model (required)
voice | str | Voice selection (required)
input | str | Text to convert (required)
response_format | str | "mp3", "opus", "aac", "flac"
speed | float | Speed multiplier (0.25 to 4.0)
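
These parameters can be combined freely, for example slowing playback while requesting lossless output. A sketch using values from the table above; the input text and output filename are illustrative:

from apertis import Apertis

def main():
    client = Apertis()

    response = client.audio.speech.create(
        model="tts-1-hd",
        voice="onyx",
        input="Playback at three quarters of normal speed.",
        response_format="flac",  # Lossless output
        speed=0.75,  # Any value from 0.25 to 4.0
    )

    with open("slow_output.flac", "wb") as f:
        f.write(response.content)

    print("Audio saved to slow_output.flac")

if __name__ == "__main__":
    main()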

Supported Audio Formats

  • MP3
  • MP4
  • MPEG
  • MPGA
  • M4A
  • WAV
  • WebM
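
When processing a folder of mixed files (as in the Batch Transcription example), it can help to filter on these extensions before uploading. A small sketch; the extension set mirrors the formats listed above, and the folder name is illustrative:

from pathlib import Path

# Extensions corresponding to the supported formats listed above
SUPPORTED_EXTENSIONS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}

def supported_audio_files(folder: str) -> list[Path]:
    """Return files in a folder whose extension matches a supported format."""
    return [
        p for p in Path(folder).iterdir()
        if p.suffix.lower() in SUPPORTED_EXTENSIONS
    ]

if __name__ == "__main__":
    for path in supported_audio_files("audio_folder"):
        print(path)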