Build real-time voice agents with streaming text-to-speech
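The examples below drive the multi-stream text-to-speech WebSocket (`wss://api.upliftai.org/text-to-speech/multi-stream`) over Socket.IO. Each synthesis request carries a client-generated `requestId`; the server streams back base64-encoded audio chunks tagged with that ID, so an agent can buffer, play, and cancel utterances independently. The Node.js examples assume `socket.io-client` and `uuid` are installed (`npm install socket.io-client uuid`).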
```javascript
import { io } from 'socket.io-client';
import { v4 as uuidv4 } from 'uuid';

class ConversationalAgent {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.socket = null;
    this.audioQueue = new Map();
    this.currentRequestId = null;
  }

  connect() {
    this.socket = io('wss://api.upliftai.org/text-to-speech/multi-stream', {
      auth: { token: this.apiKey },
      transports: ['websocket']
    });

    this.socket.on('message', this.handleMessage.bind(this));

    // Resolve once the server confirms the session is ready
    return new Promise((resolve) => {
      const onReady = (data) => {
        if (data.type === 'ready') {
          console.log('WebSocket connected:', data.sessionId);
          this.socket.off('message', onReady);
          resolve();
        }
      };
      this.socket.on('message', onReady);
    });
  }

  handleMessage(data) {
    switch (data.type) {
      case 'audio_start':
        console.log(`Starting audio for: ${data.requestId}`);
        this.audioQueue.set(data.requestId, []);
        break;

      case 'audio': {
        // Collect audio chunks as they stream in
        const chunks = this.audioQueue.get(data.requestId) || [];
        chunks.push(Buffer.from(data.audio, 'base64'));
        this.audioQueue.set(data.requestId, chunks);

        // Start playing after the first few chunks, for lower latency
        if (chunks.length === 2) {
          this.startPlayback(data.requestId);
        }
        break;
      }

      case 'audio_end':
        console.log(`Audio complete for: ${data.requestId}`);
        this.finalizePlayback(data.requestId);
        break;

      case 'error':
        console.error(`Error for ${data.requestId}:`, data.message);
        this.audioQueue.delete(data.requestId);
        break;
    }
  }

  async speak(text, voiceId = 'v_meklc281') {
    // Cancel any ongoing synthesis before starting a new one
    if (this.currentRequestId) {
      this.cancel(this.currentRequestId);
    }

    const requestId = uuidv4();
    this.currentRequestId = requestId;

    // Start new synthesis
    this.socket.emit('synthesize', {
      type: 'synthesize',
      requestId: requestId,
      text: text,
      voiceId: voiceId,
      outputFormat: 'MP3_22050_32'
    });

    return requestId;
  }

  cancel(requestId) {
    if (requestId) {
      this.socket.emit('cancel', {
        type: 'cancel',
        requestId: requestId
      });
      this.audioQueue.delete(requestId);
    }
  }

  startPlayback(requestId) {
    // In a real app, stream the buffered chunks to an audio player here
    console.log(`Starting playback for ${requestId}`);
  }

  finalizePlayback(requestId) {
    const chunks = this.audioQueue.get(requestId);
    if (chunks) {
      const fullAudio = Buffer.concat(chunks);
      console.log(`Final audio size: ${fullAudio.length} bytes`);
      // Play or save the complete audio
      this.audioQueue.delete(requestId);
    }
  }

  disconnect() {
    if (this.socket) {
      this.socket.disconnect();
    }
  }
}

// Usage example
async function main() {
  const agent = new ConversationalAgent('sk_api_your_key');

  // Connect to the WebSocket
  await agent.connect();

  // Simulate a conversation
  const conversation = [
    "سلام! میں آپ کی مدد کیسے کر سکتی ہوں؟",  // "Hello! How can I help you?"
    "آج موسم بہت خوبصورت ہے۔",                // "The weather is very beautiful today."
    "کیا آپ کو کوئی سوال پوچھنا ہے؟"           // "Do you have a question to ask?"
  ];

  for (const text of conversation) {
    await agent.speak(text);
    // Wait for a response (in a real app, this would be user input)
    await new Promise(resolve => setTimeout(resolve, 3000));
  }

  agent.disconnect();
}

main().catch(console.error);
```
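`startPlayback` and `finalizePlayback` are left as stubs above because playback depends on your environment. In a browser, one option is to feed the chunks into an `<audio>` element through Media Source Extensions. The `StreamingPlayer` below is a hypothetical sketch, not part of the API: it assumes the browser supports `audio/mpeg` source buffers and that each chunk is a `Uint8Array` (in a browser you would decode the base64 payload with `atob` rather than Node's `Buffer`).

```javascript
// Hypothetical browser-side player: feeds streamed MP3 chunks into an
// <audio> element via Media Source Extensions. Assumes 'audio/mpeg'
// source buffers are supported; not part of the Uplift AI API.
class StreamingPlayer {
  constructor() {
    this.mediaSource = new MediaSource();
    this.audio = new Audio();
    this.audio.src = URL.createObjectURL(this.mediaSource);
    this.pending = [];        // chunks waiting to be appended
    this.sourceBuffer = null;
    this.ended = false;

    this.mediaSource.addEventListener('sourceopen', () => {
      this.sourceBuffer = this.mediaSource.addSourceBuffer('audio/mpeg');
      this.sourceBuffer.addEventListener('updateend', () => this.flush());
      this.flush();
    });
  }

  // Queue a decoded chunk (Uint8Array) and start playback on first data
  enqueue(chunk) {
    this.pending.push(chunk);
    this.flush();
    if (this.audio.paused) this.audio.play().catch(() => {});
  }

  // Append one pending chunk at a time; close the stream when done
  flush() {
    if (!this.sourceBuffer || this.sourceBuffer.updating) return;
    if (this.pending.length) {
      this.sourceBuffer.appendBuffer(this.pending.shift());
    } else if (this.ended && this.mediaSource.readyState === 'open') {
      this.mediaSource.endOfStream();
    }
  }

  // Signal that no more chunks will arrive for this stream
  end() {
    this.ended = true;
    this.flush();
  }
}
```

Source buffers reject appends while an update is in flight, which is why chunks pass through the `pending` queue flushed on each `updateend` instead of being appended directly.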
```javascript
class MultiVoiceConversation {
  constructor(apiKey) {
    this.agent = new ConversationalAgent(apiKey);
    this.voices = {
      assistant: 'v_meklc281', // Urdu female
      narrator: 'v_30s70t3a',  // News voice
      character: 'v_yypgzenx'  // Dada Jee
    };
  }

  async playDialogue(script) {
    await this.agent.connect();

    for (const line of script) {
      const { speaker, text } = line;
      const voiceId = this.voices[speaker] || this.voices.assistant;

      console.log(`${speaker}: ${text}`);
      await this.agent.speak(text, voiceId);

      // Wait for the audio to finish (simplified; see the helper below)
      await new Promise(resolve => setTimeout(resolve, 2000));
    }

    this.agent.disconnect();
  }
}

// Example dialogue
const dialogue = [
  { speaker: 'narrator', text: 'ایک دن کی بات ہے...' },              // "One day..."
  { speaker: 'assistant', text: 'کیا آپ نے یہ کہانی سنی ہے؟' },      // "Have you heard this story?"
  { speaker: 'character', text: 'ہاں بیٹا، یہ بہت پرانی کہانی ہے۔' }  // "Yes, child, it is a very old story."
];

const conversation = new MultiVoiceConversation('sk_api_your_key');
conversation.playDialogue(dialogue).catch(console.error);
```
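The fixed 2-second sleep is only a placeholder. Since the server already emits `audio_end` per `requestId`, a small helper can wait for that event instead of guessing durations. Here is a minimal sketch; `waitForAudioEnd` is a hypothetical helper, not part of the API, and assumes access to the agent's underlying socket.

```javascript
// Resolves when the server reports 'audio_end' (or rejects on 'error')
// for a given requestId, so dialogue lines can be chained precisely.
function waitForAudioEnd(socket, requestId, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const timer = setTimeout(() => {
      socket.off('message', onMessage);
      reject(new Error(`Timed out waiting for audio_end: ${requestId}`));
    }, timeoutMs);

    function onMessage(data) {
      if (data.requestId !== requestId) return;
      if (data.type === 'audio_end' || data.type === 'error') {
        clearTimeout(timer);
        socket.off('message', onMessage);
        data.type === 'error' ? reject(new Error(data.message)) : resolve();
      }
    }

    socket.on('message', onMessage);
  });
}

// Inside playDialogue, replacing the fixed sleep:
//   const requestId = await this.agent.speak(text, voiceId);
//   await waitForAudioEnd(this.agent.socket, requestId);
```

The same message protocol also works from Python with the `python-socketio` AsyncClient: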
```python
import asyncio
import base64
import uuid

import socketio

# The namespace is the path portion of the WebSocket URL used above
NAMESPACE = '/text-to-speech/multi-stream'


class WebSocketClient:
    def __init__(self, api_key, base_url):
        self.api_key = api_key
        self.base_url = base_url  # e.g. 'wss://api.upliftai.org'
        self.sio = None
        self.audio_callbacks = {}

    async def connect(self):
        self.sio = socketio.AsyncClient()

        # Handlers must be registered on the TTS namespace; without it
        # they would listen on the default '/' namespace and never fire
        self.sio.on('message', self._on_message, namespace=NAMESPACE)

        # Connect with auth
        await self.sio.connect(
            self.base_url,
            auth={'token': self.api_key},
            namespaces=[NAMESPACE],
        )

    async def synthesize(self, text, voice_id='v_meklc281'):
        request_id = str(uuid.uuid4())
        audio_queue = asyncio.Queue()
        self.audio_callbacks[request_id] = audio_queue

        # Send the synthesis request
        await self.sio.emit('synthesize', {
            'type': 'synthesize',
            'requestId': request_id,
            'text': text,
            'voiceId': voice_id,
            'outputFormat': 'MP3_22050_32',
        }, namespace=NAMESPACE)

        return audio_queue

    async def _on_message(self, data):
        if data['type'] == 'audio':
            request_id = data['requestId']
            if request_id in self.audio_callbacks:
                audio_bytes = base64.b64decode(data['audio'])
                await self.audio_callbacks[request_id].put(audio_bytes)
        elif data['type'] == 'audio_end':
            request_id = data['requestId']
            if request_id in self.audio_callbacks:
                # None is the end-of-stream sentinel for consumers
                await self.audio_callbacks[request_id].put(None)
                del self.audio_callbacks[request_id]
```
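The queue-based design decouples synthesis from consumption: a consumer drains the returned `asyncio.Queue` until it sees the `None` sentinel that `_on_message` enqueues on `audio_end`. A minimal sketch that writes one utterance to an MP3 file (the file name and API key are placeholders):

```python
async def main():
    client = WebSocketClient('sk_api_your_key', 'wss://api.upliftai.org')
    await client.connect()

    queue = await client.synthesize('آج موسم بہت خوبصورت ہے۔')  # "The weather is very beautiful today."

    # Drain chunks until the None sentinel signals end of stream
    with open('output.mp3', 'wb') as f:
        while True:
            chunk = await queue.get()
            if chunk is None:
                break
            f.write(chunk)

    await client.sio.disconnect()


asyncio.run(main())
```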