語音合成
說明
支持的領域 / 任務:audio(音頻) / tts(語音合成)。
語音合成API基於達摩院改良的自回歸韻律模型,支持文本至語音的實時流式合成。可被應用於:
智能設備/機器人播報的語音內容,如智能客服機器人、智能音箱、數字人等。
音視頻創作中需要將文字轉為語音播報的場景,如小說閱讀、新聞播報、影視解說、配音等。
前提條件
已開通服務並獲得API-KEY:開通DashScope並創建API-KEY。
已安裝最新版SDK:安裝DashScope SDK。
示例代碼
將合成音頻保存為文件
以下代碼展示了將流式返回的二進制音頻,保存為本地文件。
說明
需要使用您的api-key替換示例中的 your-dashscope-api-key ,代碼才能正常運行。
# coding=utf-8
import dashscope
from dashscope.audio.tts import SpeechSynthesizer
# Replace the placeholder with your own DashScope API key.
dashscope.api_key = 'your-dashscope-api-key'

# Blocking call: the returned result holds the synthesized audio.
result = SpeechSynthesizer.call(
    model='sambert-zhichu-v1',
    text='今天天氣怎么樣',
    sample_rate=48000,
)

# Only write the file when the result actually carries audio data.
audio_data = result.get_audio_data()
if audio_data is not None:
    with open('output.wav', 'wb') as wav_file:
        wav_file.write(audio_data)
package com.alibaba.dashscope.sample;
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Status;
import java.io.*;
import java.nio.ByteBuffer;
public class Main {
  /**
   * Synthesizes a fixed sentence with a blocking call and saves the returned audio to
   * {@code output.wav} in the working directory.
   *
   * @throws IllegalStateException if the synthesizer returns no audio data
   * @throws UncheckedIOException if the output file cannot be written
   */
  public static void SyncAudioDataToFile() {
    SpeechSynthesizer synthesizer = new SpeechSynthesizer();
    SpeechSynthesisParam param =
        SpeechSynthesisParam.builder()
            .apiKey("your-dashscope-api-key") // replace with your actual API key
            .model("sambert-zhichu-v1")
            .text("今天天氣怎么樣")
            .sampleRate(48000)
            .build();
    File file = new File("output.wav");
    // Blocking call: returns the synthesized audio for the given parameters.
    ByteBuffer audio = synthesizer.call(param);
    // Check the result before opening the stream so that a failed synthesis neither leaves an
    // empty output file behind nor surfaces as a bare NullPointerException.
    if (audio == null) {
      throw new IllegalStateException("Speech synthesis returned no audio data");
    }
    try (FileOutputStream fos = new FileOutputStream(file)) {
      // array() exposes the buffer's whole backing array.
      // NOTE(review): assumes the SDK returns an array-backed buffer whose content spans the
      // entire array - confirm against the SDK.
      fos.write(audio.array());
    } catch (IOException e) {
      throw new UncheckedIOException("Failed to write " + file.getPath(), e);
    }
  }

  /** Runs the sample and then terminates the JVM explicitly. */
  public static void main(String[] args) {
    SyncAudioDataToFile();
    System.exit(0);
  }
}
將合成音頻通過設備播放
調用成功後,通過本地設備播放實時返回的音頻內容。
說明
需要使用您的api-key替換示例中的 your-dashscope-api-key ,代碼才能正常運行。
運行Python示例前,需要通過pip安裝第三方音頻播放套件。
# coding=utf-8
#
# Installation instructions for pyaudio:
# APPLE Mac OS X
# brew install portaudio
# pip install pyaudio
# Debian/Ubuntu
# sudo apt-get install python-pyaudio python3-pyaudio
# or
# pip install pyaudio
# CentOS
# sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
# python -m pip install pyaudio
import dashscope
import sys
import pyaudio
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
# Replace the placeholder with your own DashScope API key before running.
dashscope.api_key='your-dashscope-api-key'
class Callback(ResultCallback):
    """Plays streamed synthesis audio through a PyAudio output stream.

    The stream is opened in on_open, fed with audio frames in on_event and
    released in on_close.
    """
    # PyAudio instance and its output stream; both are created in on_open.
    _player = None
    _stream = None

    def on_open(self):
        """Create the PyAudio player and open its output stream."""
        print('Speech synthesizer is opened.')
        self._player = pyaudio.PyAudio()
        # 16-bit mono at 48 kHz; must match the audio requested from the
        # synthesizer (sample_rate=48000, format='pcm').
        self._stream = self._player.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=48000,
            output=True)

    def on_complete(self):
        """Report that synthesis has completed."""
        print('Speech synthesizer is completed.')

    def on_error(self, response: SpeechSynthesisResponse):
        """Report a failed synthesis together with the response."""
        print('Speech synthesizer failed, response is %s' % (str(response)))

    def on_close(self):
        """Stop playback and release the audio resources that were opened."""
        print('Speech synthesizer is closed.')
        # on_open may not have run (or may have failed part-way), in which
        # case these are still None; release only what was created.
        if self._stream is not None:
            self._stream.stop_stream()
            self._stream.close()
            self._stream = None
        if self._player is not None:
            self._player.terminate()
            self._player = None

    def on_event(self, result: SpeechSynthesisResult):
        """Play the audio frame and print the timestamp carried by result, if any."""
        audio_frame = result.get_audio_frame()
        if audio_frame is not None:
            # len() is the number of audio bytes; sys.getsizeof would report
            # the object's memory footprint instead.
            print('audio result length:', len(audio_frame))
            self._stream.write(audio_frame)
        timestamp = result.get_timestamp()
        if timestamp is not None:
            print('timestamp result:', str(timestamp))
# Request PCM audio and hand every streamed result to the callback.
callback = Callback()
synthesis_options = dict(
    model='sambert-zhichu-v1',
    text='今天天氣怎么樣',
    sample_rate=48000,
    format='pcm',
    callback=callback,
)
SpeechSynthesizer.call(**synthesis_options)
package com.alibaba.dashscope.sample;
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Status;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.sound.sampled.*;
public class Main {
  /**
   * Streams synthesized speech to an audio output line while it is being generated.
   *
   * <p>Audio frames delivered to the callback are queued and written to a {@link SourceDataLine}
   * by a dedicated playback thread. The method returns once synthesis has completed or failed
   * and the playback thread has exited.
   *
   * @throws RuntimeException if the audio line is unavailable, if the synthesis call itself
   *     throws, or if the calling thread is interrupted while waiting
   */
  public static void StreamAuidoDataToSpeaker() {
    CountDownLatch latch = new CountDownLatch(1);
    SpeechSynthesizer synthesizer = new SpeechSynthesizer();
    SpeechSynthesisParam param =
        SpeechSynthesisParam.builder()
            .apiKey("your-dashscope-api-key") // replace with your actual API key
            .text("今天天氣怎么樣")
            .model("sambert-zhichu-v1")
            .sampleRate(48000)
            .format(SpeechSynthesisAudioFormat.PCM) // streaming synthesis uses PCM or MP3
            .build();

    // Playback worker: drains queued audio frames into the output line.
    class PlaybackRunnable implements Runnable {
      // Audio format of the output line: 48 kHz, 16-bit, mono, signed, little-endian. Adjust it to
      // the sample rate of the chosen model and to what the local device supports.
      private final AudioFormat af = new AudioFormat(48000, 16, 1, true, false);
      private final DataLine.Info info = new DataLine.Info(SourceDataLine.class, af);
      private SourceDataLine targetSource = null;
      private final AtomicBoolean runFlag = new AtomicBoolean(true);
      private final ConcurrentLinkedQueue<ByteBuffer> queue = new ConcurrentLinkedQueue<>();

      /** Opens and starts the output line; must be called before the thread is started. */
      public void prepare() throws LineUnavailableException {
        targetSource = (SourceDataLine) AudioSystem.getLine(info);
        targetSource.open(af, 4096);
        targetSource.start();
      }

      /** Queues one audio frame for playback. */
      public void put(ByteBuffer buffer) {
        queue.add(buffer);
      }

      /** Asks the worker to finish; frames that are already queued are still played. */
      public void stop() {
        runFlag.set(false);
      }

      /** Writes the whole backing array of one frame to the output line. */
      private void play(ByteBuffer buffer) {
        // NOTE(review): SourceDataLine.write requires a whole number of sample frames (2 bytes
        // each for this format); assumes every delivered frame has an even length - confirm.
        byte[] data = buffer.array();
        targetSource.write(data, 0, data.length);
      }

      @Override
      public void run() {
        if (targetSource == null) {
          return;
        }
        try {
          while (runFlag.get()) {
            ByteBuffer buffer = queue.poll();
            if (buffer != null) {
              play(buffer);
              continue;
            }
            // Nothing queued yet: wait briefly before polling again.
            try {
              Thread.sleep(100);
            } catch (InterruptedException e) {
              // Preserve the interrupt status and stop waiting instead of swallowing it.
              Thread.currentThread().interrupt();
              break;
            }
          }
          // Play whatever is still queued after the stop request.
          ByteBuffer remaining;
          while ((remaining = queue.poll()) != null) {
            play(remaining);
          }
          targetSource.drain();
        } finally {
          // Release the player even if writing to the line failed.
          targetSource.stop();
          targetSource.close();
        }
      }
    }

    // Callback that forwards streamed synthesis results to the playback worker.
    class ReactCallback extends ResultCallback<SpeechSynthesisResult> {
      private final PlaybackRunnable playbackRunnable;

      public ReactCallback(PlaybackRunnable playbackRunnable) {
        this.playbackRunnable = playbackRunnable;
      }

      // Called for each streamed synthesis result returned by the service.
      @Override
      public void onEvent(SpeechSynthesisResult result) {
        // getAudioFrame() carries the binary audio of this streamed result, if any.
        if (result.getAudioFrame() != null) {
          // Push the frame to the player.
          playbackRunnable.put(result.getAudioFrame());
        }
      }

      // Called once the service has finished synthesis.
      @Override
      public void onComplete() {
        // Tell the playback thread to finish.
        playbackRunnable.stop();
        latch.countDown();
      }

      // Called when an error occurs.
      @Override
      public void onError(Exception e) {
        // Report the error and tell the playback thread to finish.
        System.out.println(e);
        playbackRunnable.stop();
        latch.countDown();
      }
    }

    PlaybackRunnable playbackRunnable = new PlaybackRunnable();
    try {
      playbackRunnable.prepare();
    } catch (LineUnavailableException e) {
      throw new RuntimeException(e);
    }
    Thread playbackThread = new Thread(playbackRunnable);
    // Start the playback thread.
    playbackThread.start();
    // call() with a callback does not block the current thread.
    try {
      synthesizer.call(param, new ReactCallback(playbackRunnable));
    } catch (RuntimeException e) {
      // No callback will stop the worker in this case, so stop it here; otherwise the
      // playback thread would keep polling forever.
      playbackRunnable.stop();
      throw e;
    }
    // Wait for synthesis to finish.
    try {
      latch.await();
      // Wait until the playback thread has played everything.
      playbackThread.join();
    } catch (InterruptedException e) {
      // Restore the interrupt status before propagating.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  /** Runs the sample and then terminates the JVM explicitly. */
  public static void main(String[] args) {
    StreamAuidoDataToSpeaker();
    System.exit(0);
  }
}
了解更多
文檔內容是否對您有幫助?