Paraformer Speech Recognition
Supported domain / task: audio / asr (speech recognition)
The Paraformer speech recognition API is built on DAMO Academy's new-generation non-autoregressive end-to-end model. It provides speech recognition for real-time audio streams as well as for all kinds of audio and video files. It can be used for:
Real-time scenarios with strict latency requirements on returning recognition results, such as live meeting notes, live streaming subtitles, and telephone customer service.
Recognizing the speech in audio and video files for content understanding, analysis, and subtitle generation.
Recognizing call-center recordings from telephone customer service for quality inspection.
Quick start
Prerequisites
You have activated the service and obtained an API-KEY: activate DashScope and create an API-KEY.
You have installed the latest SDK: install the DashScope SDK.
Real-time speech recognition example code
Real-time speech recognition transcribes an audio stream of unlimited duration on the fly, so text appears while the speaker is still talking. It includes built-in smart sentence segmentation and can provide the start and end time of each sentence. Typical scenarios include live video subtitles, real-time meeting notes, real-time court transcripts, and intelligent voice assistants.
Streaming speech-to-text on-screen display with a microphone
The following example uses the real-time speech recognition API to perform streaming recognition from a microphone and put the text on screen as it is spoken.
Replace your-dashscope-api-key in the example with your own API-KEY for the code to run (or load the key from an environment variable, as sketched below).
Before running the Python example, install the third-party audio playback and capture package with pip install pyaudio.
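Rather than hard-coding the key, you can read it from an environment variable. Below is a minimal sketch; the variable name DASHSCOPE_API_KEY is an assumption here, and any variable you control works:

import os
import dashscope

# Assumption: the key was exported beforehand, e.g. DASHSCOPE_API_KEY=<your-dashscope-api-key>
api_key = os.getenv('DASHSCOPE_API_KEY')
if not api_key:
    raise RuntimeError('DASHSCOPE_API_KEY is not set')
dashscope.api_key = api_key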
# For prerequisites running the following sample, visit http://bestwisewords.com/document_detail/611472.html
import pyaudio
import dashscope
from dashscope.audio.asr import (Recognition, RecognitionCallback,
                                 RecognitionResult)

dashscope.api_key = '<your-dashscope-api-key>'

mic = None
stream = None


class Callback(RecognitionCallback):
    def on_open(self) -> None:
        # Called when the recognition session is established: open the microphone.
        global mic
        global stream
        print('RecognitionCallback open.')
        mic = pyaudio.PyAudio()
        stream = mic.open(format=pyaudio.paInt16,
                          channels=1,
                          rate=16000,
                          input=True)

    def on_close(self) -> None:
        # Called when the recognition session ends: release the microphone.
        global mic
        global stream
        print('RecognitionCallback close.')
        stream.stop_stream()
        stream.close()
        mic.terminate()
        stream = None
        mic = None

    def on_event(self, result: RecognitionResult) -> None:
        # Called whenever an intermediate or final sentence result arrives.
        print('RecognitionCallback sentence: ', result.get_sentence())


callback = Callback()

recognition = Recognition(model='paraformer-realtime-v1',
                          format='pcm',
                          sample_rate=16000,
                          callback=callback)
recognition.start()

while True:
    if stream:
        # Read 3200 bytes (100 ms of 16 kHz 16-bit mono audio) and send it to the service.
        data = stream.read(3200, exception_on_overflow=False)
        recognition.send_audio_frame(data)
    else:
        break

recognition.stop()
package com.alibaba.dashscope.sample.recognition.quickstart;
import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import io.reactivex.BackpressureStrategy;
import io.reactivex.Flowable;
import java.nio.ByteBuffer;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.TargetDataLine;
public class Main {
public static void main(String[] args) {
// Create a Flowable<ByteBuffer> audio source
Flowable<ByteBuffer> audioSource =
Flowable.create(
emitter -> {
new Thread(
() -> {
try {
// Define the audio format: 16 kHz, 16-bit, mono, signed, little-endian
AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false);
// Open the default recording device that matches this format
TargetDataLine targetDataLine =
AudioSystem.getTargetDataLine(audioFormat);
targetDataLine.open(audioFormat);
// Start recording
targetDataLine.start();
ByteBuffer buffer = ByteBuffer.allocate(1024);
long start = System.currentTimeMillis();
// Record for 30 s and transcribe in real time
while (System.currentTimeMillis() - start < 30000) {
int read = targetDataLine.read(buffer.array(), 0, buffer.capacity());
if (read > 0) {
buffer.limit(read);
// Send the recorded audio data to the streaming recognition service
emitter.onNext(buffer);
buffer = ByteBuffer.allocate(1024);
// The recording rate is limited; sleep briefly to avoid high CPU usage
Thread.sleep(20);
}
}
// Signal the end of transcription
emitter.onComplete();
} catch (Exception e) {
emitter.onError(e);
}
})
.start();
},
BackpressureStrategy.BUFFER);
// Create the Recognizer
Recognition recognizer = new Recognition();
// Create RecognitionParam; the Flowable<ByteBuffer> created above is passed to streamCall below
RecognitionParam param =
RecognitionParam.builder()
.model("paraformer-realtime-v1")
.format("pcm")
.sampleRate(16000)
.apiKey("your-dashscope-api-key")
.build();
// Call the streaming interface.
// For Java SDK 2.0.0 and above, streamCall requires catching
// ApiException and NoApiKeyException; versions below 2.0.0 do not.
try {
recognizer
.streamCall(param, audioSource)
// Consume the result Flowable (blockingForEach blocks until the stream completes)
.blockingForEach(
result -> {
// Print the final result of a completed sentence
if (result.isSentenceEnd()) {
System.out.println("Fix:" + result.getSentence().getText());
} else {
System.out.println("Result:" + result.getSentence().getText());
}
});
} catch (Exception e) {
e.printStackTrace();
}
System.exit(0);
}
}
For more detailed examples, see Implementing audio/video file transcription and real-time on-screen text with speech recognition.
File transcription with the synchronous interface
The following example uses the synchronous speech recognition API to transcribe a file. For short, near-real-time recognition scenarios such as conversational chat, voice commands, voice input methods, and voice search, this interface is a good fit.
# For prerequisites running the following sample, visit http://bestwisewords.com/document_detail/611472.html
import requests
import dashscope
from dashscope.audio.asr import Recognition

dashscope.api_key = '<your-dashscope-api-key>'

# Download the sample audio file to the local working directory.
r = requests.get(
    'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/realtime_asr_example.wav'
)
with open('asr_example.wav', 'wb') as f:
    f.write(r.content)

# Transcribe the local file synchronously; no callback is needed.
recognition = Recognition(model='paraformer-realtime-v1',
                          format='wav',
                          sample_rate=16000,
                          callback=None)
result = recognition.call('asr_example.wav')

# Write each recognized sentence to a text file.
with open('asr_result.txt', 'w+') as f:
    for sentence in result.get_sentence():
        f.write(str(sentence) + '\n')
package com.alibaba.dashscope.sample.recognition.quickstart;
import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
public class Main {
public static void main(String[] args) {
// You can skip the URL download step here and call the API directly on a local file.
String exampleWavUrl =
"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/realtime_asr_example.wav";
try {
InputStream in = new URL(exampleWavUrl).openStream();
Files.copy(in, Paths.get("asr_example.wav"), StandardCopyOption.REPLACE_EXISTING);
} catch (IOException e) {
System.out.println("error: " + e);
System.exit(1);
}
// Create a Recognition instance
Recognition recognizer = new Recognition();
// Create RecognitionParam; replace the apiKey with your real API key in actual use
RecognitionParam param =
RecognitionParam.builder()
.model("paraformer-realtime-v1")
.format("wav")
.sampleRate(16000)
.apiKey("your-dashscope-api-key")
.build();
// Save the result directly to asr_result.txt
try (FileOutputStream fos = new FileOutputStream("asr_result.txt")) {
String result = recognizer.call(param, new File("asr_example.wav"));
System.out.println(result);
fos.write(result.getBytes());
} catch (Exception e) {
e.printStackTrace();
}
System.exit(0);
}
}
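Each sentence written out above is a dictionary with the fields shown in the result example below (text, begin_time, end_time, words). As a minimal, non-authoritative sketch, assuming the result variable from the Python call above, the sentences can be joined into a plain-text transcript:

# Sketch: build a plain-text transcript from the synchronous recognition result above.
# Assumes `result` is the value returned by recognition.call('asr_example.wav').
sentences = result.get_sentence() or []
transcript = ''.join(sentence.get('text', '') for sentence in sentences)
print(transcript)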
After a successful call, the real-time recognition result that is returned looks like the following example:
{
"begin_time": 0,
"end_time": 5460,
"text": "這是由阿里巴巴達摩院語音實驗室提供的實時語音識別技術。",
"words": [
{
"begin_time": 0,
"end_time": 420,
"text": "這是",
"punctuation": ""
},
{
"begin_time": 420,
"end_time": 840,
"text": "由阿",
"punctuation": ""
},
{
"begin_time": 840,
"end_time": 1260,
"text": "里巴",
"punctuation": ""
},
{
"begin_time": 1260,
"end_time": 1680,
"text": "巴達",
"punctuation": ""
},
{
"begin_time": 1680,
"end_time": 2100,
"text": "摩院",
"punctuation": ""
},
{
"begin_time": 2100,
"end_time": 2520,
"text": "語音",
"punctuation": ""
},
{
"begin_time": 2520,
"end_time": 2940,
"text": "實驗",
"punctuation": ""
},
{
"begin_time": 2940,
"end_time": 3360,
"text": "室提",
"punctuation": ""
},
{
"begin_time": 3360,
"end_time": 3780,
"text": "供的",
"punctuation": ""
},
{
"begin_time": 3780,
"end_time": 4200,
"text": "實時",
"punctuation": ""
},
{
"begin_time": 4200,
"end_time": 4620,
"text": "語音",
"punctuation": ""
},
{
"begin_time": 4620,
"end_time": 5040,
"text": "識別",
"punctuation": ""
},
{
"begin_time": 5040,
"end_time": 5460,
"text": "技術",
"punctuation": "。"
}
]
}
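In the streaming case, on_event is invoked repeatedly with intermediate and final sentence results shaped like the example above. The following non-authoritative sketch prints each word with its time span and treats a sentence as final once its end_time is populated; that heuristic is an assumption, since intermediate results may leave the field unset:

# Sketch of a handler for the sentence structure shown above.
# Assumption: a sentence is treated as final once its 'end_time' is populated.
def handle_sentence(sentence: dict) -> None:
    text = sentence.get('text', '')
    if sentence.get('end_time') is not None:
        print(f"[final {sentence['begin_time']}-{sentence['end_time']} ms] {text}")
        for word in sentence.get('words', []):
            print(f"  {word['begin_time']:>5}-{word['end_time']:<5} ms "
                  f"{word['text']}{word['punctuation']}")
    else:
        print(f"[partial] {text}")

In the microphone example, on_event could forward the sentence returned by result.get_sentence() to a handler like this one.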
Asynchronous file transcription example code
The following example calls the asynchronous Paraformer file transcription API to batch-transcribe multiple audio files specified by URL.
Replace your-dashscope-api-key in the example with your own API-KEY for the code to run.
# For prerequisites running the following sample, visit http://bestwisewords.com/document_detail/611472.html
import json
from urllib import request

import dashscope

dashscope.api_key = 'your-dashscope-api-key'

# Submit the transcription task; the call returns immediately with a task id.
task_response = dashscope.audio.asr.Transcription.async_call(
    model='paraformer-v1',
    file_urls=[
        'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female.wav',
        'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male.wav'
    ])

# Block until the task finishes and collect the per-file results.
transcription_response = dashscope.audio.asr.Transcription.wait(
    task=task_response.output.task_id)

# Each result carries a transcription_url pointing to a JSON file with the transcript.
for transcription in transcription_response.output['results']:
    url = transcription['transcription_url']
    result = json.loads(request.urlopen(url).read().decode('utf8'))
    print(json.dumps(result, indent=4, ensure_ascii=False))
package com.alibaba.dashscope.sample.transcription;
import com.alibaba.dashscope.audio.asr.transcription.*;
import com.google.gson.*;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.*;
import java.net.HttpURLConnection;
import java.util.Arrays;
import java.util.List;
public class Main {
public static void main(String[] args) {
// Create the transcription request parameters; replace your-dashscope-api-key with a real API key
TranscriptionParam param =
TranscriptionParam.builder()
.apiKey("your-dashscope-api-key")
.model("paraformer-v1")
.fileUrls(
Arrays.asList(
"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female.wav",
"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male.wav"))
.build();
try {
Transcription transcription = new Transcription();
// Submit the transcription request
TranscriptionResult result = transcription.asyncCall(param);
// Wait for the transcription to complete
result = transcription.wait(
TranscriptionQueryParam.FromTranscriptionParam(param, result.getTaskId()));
// Fetch the transcription results
List<TranscriptionTaskResult> taskResultList = result.getResults();
if (taskResultList != null && taskResultList.size() > 0) {
for (TranscriptionTaskResult taskResult : taskResultList) {
String transcriptionUrl = taskResult.getTranscriptionUrl();
HttpURLConnection connection =
(HttpURLConnection) new URL(transcriptionUrl).openConnection();
connection.setRequestMethod("GET");
connection.connect();
BufferedReader reader =
new BufferedReader(new InputStreamReader(connection.getInputStream()));
Gson gson = new GsonBuilder().setPrettyPrinting().create();
System.out.println(gson.toJson(gson.fromJson(reader, JsonObject.class)));
}
}
} catch (Exception e) {
System.out.println("error: " + e);
}
System.exit(0);
}
}
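A minimal sketch of how the Python example above could save the downloaded transcripts to disk instead of printing them; it assumes the transcription_response variable from that example, and the local file names are invented for illustration:

import json
from urllib import request

# Sketch: persist each downloaded transcript from the example above.
# Assumes `transcription_response` returned by dashscope.audio.asr.Transcription.wait(...).
for index, item in enumerate(transcription_response.output['results']):
    data = json.loads(request.urlopen(item['transcription_url']).read().decode('utf8'))
    # The local file name is illustrative only.
    with open(f'transcription_{index}.json', 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)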
Files specified by URL for transcription must not exceed 2 GB each.
The file_urls parameter accepts multiple file URLs; the example shows transcribing several file URLs in one request.
After a successful call, file transcription results like the following examples are returned.
{
"file_url": "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male.wav",
"properties": {
"audio_format": "pcm_s16le",
"channels": [
0
],
"original_sampling_rate": 16000,
"original_duration_in_milliseconds": 3874
},
"transcripts": [
{
"channel_id": 0,
"content_duration_in_milliseconds": 3540,
"text": "Hello, world, 來自阿里巴巴達摩院語音實驗室。",
"sentences": [
{
"begin_time": 60,
"end_time": 3600,
"text": "Hello, world, 來自阿里巴巴達摩院語音實驗室。",
"words": [
{
"begin_time": 60,
"end_time": 660,
"text": "Hello",
"punctuation": ", "
},
{
"begin_time": 660,
"end_time": 1080,
"text": "world",
"punctuation": ", "
},
{
"begin_time": 1080,
"end_time": 1440,
"text": "來自",
"punctuation": ""
},
{
"begin_time": 1440,
"end_time": 1800,
"text": "阿里",
"punctuation": ""
},
{
"begin_time": 1800,
"end_time": 2160,
"text": "巴巴",
"punctuation": ""
},
{
"begin_time": 2160,
"end_time": 2520,
"text": "達摩",
"punctuation": ""
},
{
"begin_time": 2520,
"end_time": 2880,
"text": "院語",
"punctuation": ""
},
{
"begin_time": 2880,
"end_time": 3240,
"text": "音實",
"punctuation": ""
},
{
"begin_time": 3240,
"end_time": 3600,
"text": "驗室",
"punctuation": "。"
}
]
}
]
}
]
}
{
"file_url": "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female.wav",
"properties": {
"audio_format": "pcm_s16le",
"channels": [
0
],
"original_sampling_rate": 16000,
"original_duration_in_milliseconds": 4087
},
"transcripts": [
{
"channel_id": 0,
"content_duration_in_milliseconds": 3780,
"text": "Hello, world, 來自阿里巴巴達摩院語音實驗室。",
"sentences": [
{
"begin_time": 60,
"end_time": 3840,
"text": "Hello, world, 來自阿里巴巴達摩院語音實驗室。",
"words": [
{
"begin_time": 60,
"end_time": 780,
"text": "Hello",
"punctuation": ", "
},
{
"begin_time": 780,
"end_time": 1320,
"text": "world",
"punctuation": ", "
},
{
"begin_time": 1320,
"end_time": 1680,
"text": "來自",
"punctuation": ""
},
{
"begin_time": 1680,
"end_time": 2040,
"text": "阿里",
"punctuation": ""
},
{
"begin_time": 2040,
"end_time": 2400,
"text": "巴巴",
"punctuation": ""
},
{
"begin_time": 2400,
"end_time": 2760,
"text": "達摩",
"punctuation": ""
},
{
"begin_time": 2760,
"end_time": 3120,
"text": "院語",
"punctuation": ""
},
{
"begin_time": 3120,
"end_time": 3480,
"text": "音實",
"punctuation": ""
},
{
"begin_time": 3480,
"end_time": 3840,
"text": "驗室",
"punctuation": "。"
}
]
}
]
}
]
}
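As another non-authoritative sketch, the plain text and sentence timings can be pulled out of a transcript JSON document with the structure shown above (here read from the local file assumed in the earlier saving sketch):

import json

# Sketch: extract plain text and sentence timings from a downloaded transcript
# (structure as shown above). The local file name follows the earlier saving sketch.
with open('transcription_0.json', encoding='utf8') as f:
    doc = json.load(f)

print('file:', doc['file_url'])
for transcript in doc['transcripts']:
    print(f"channel {transcript['channel_id']}: {transcript['text']}")
    for sentence in transcript['sentences']:
        print(f"  {sentence['begin_time']:>5}-{sentence['end_time']:<5} ms {sentence['text']}")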
Learn more
For detailed usage of the Paraformer speech recognition service's real-time speech recognition API and recording file transcription, see the Real-time Speech Recognition API Details and Recording File Recognition API Details pages.