本文共 13960 字,大约阅读时间需要 46 分钟。
本文参考链接:
之前分别用了百度和科大讯飞的接口来实现音频文件的转写。 百度的几乎是免费的,但是效果实在不尽如人意,科大讯飞的效果倒是很不错,但是有一点,“贵”。于是乎,想着有没有一个折中的方案:识别效果不能太差,但是价钱也最好不要太贵。转来转去,于是转到腾讯这儿来了。
腾讯的语音识别也是有 Python SDK的,链接在此:,Demo的下载地址在这个页面就有,不过暂时只有Python2的,虽说它自己也说后续会支持Python3,但是如果能早点用Python3的话,又何乐而不为?只不过Demo改改确实需要踩很多坑。
它的Demo里面主要有三个Python文件,Config.py, OfflineCLient.py, offlineSdk.py。针对在Python3环境下需要做的修改,我进行部分说明:
Config.py 其实没什么好改的,主要是 SECRET_KEY, SECRETID, APPID 用自己申请的填好就是
# -*- coding:utf-8 -*-
"""
Created on 2019-4-28

@author: iantang
"""


class Config:
    """Global configuration; fill in your own credentials and options."""

    # ------------- Required --------------
    # How to obtain AppId, secretId, secretKey:
    # https://cloud.tencent.com/document/product/441/6203
    # Console path: account menu (top right) -> Access Management ->
    # Access Keys (left menu) -> API Key Management
    SECRET_KEY = '***'
    SECRETID = '*****'
    APPID = '***'
    # Recognition results are POSTed to this URL. You must run your own
    # service to receive the callback; this field is mandatory.
    CALLBACK_URL = "http://xxx.xxxxx.xxx"

    # ------------- Optional, set as needed --------------
    # Engine: 8k_0, 16k_0 or 8k_6. 8k_6 supports speaker separation,
    # which is why it is used here.
    ENGINE_MODEL_TYPE = '8k_6'
    # Number of audio channels: 1 or 2. Only the 8k telephony models
    # support 2 channels; other models support 1 channel only.
    CHANNEL_NUM = 1
    # Result text encoding. 0: UTF-8, 1: GB2312, 2: GBK, 3: BIG5
    RES_TEXT_FORMAT = 0
    # Audio source. 0: audio URL; 1: audio data in the POST body
    SOURCE_TYPE = 0

    # ------------- Optional, defaults are fine --------------
    # Tencent Cloud project ID; 0 unless configured in the console
    # (Account Center -> Project Management).
    PROJECT_ID = 0
    # Sub-service type. 0: offline speech recognition.
    SUB_SERVICE_TYPE = 0
    # Result delivery. 0: synchronous; 1: asynchronous. Only async is
    # currently supported.
    RES_TYPE = 1
    # Tencent endpoint URL; normally no need to change.
    REQUEST_URL = "https://aai.qcloud.com/asr/v1/"
    # URL used when computing the request signature; normally unchanged.
    SIGN_URL = "aai.qcloud.com/asr/v1/"

    # ------------- Initialisation and validation --------------
    def __init__(self):
        print("")

    def verifyProperties(self):
        """Validate the configuration.

        Prints a message and returns False on the first invalid field;
        returns True when all fields look sane. (The original demo
        returned nothing; the bool is a backward-compatible addition.)
        """
        for field in ('SECRET_KEY', 'SECRETID', 'APPID', 'CALLBACK_URL'):
            if len(str(getattr(self, field))) == 0:
                print('%s can not empty' % field)
                return False
        if str(self.ENGINE_MODEL_TYPE) not in ('8k_0', '16k_0', '8k_6'):
            print('ENGINE_MODEL_TYPE is not right')
            return False
        # BUG FIX: the original demo accepted only '0'/'1' here, which
        # contradicted its own documentation ("1 or 2 channels") and
        # rejected the valid 2-channel telephony configuration.
        if str(self.CHANNEL_NUM) not in ('1', '2'):
            print('CHANNEL_NUM is not right')
            return False
        if str(self.RES_TEXT_FORMAT) not in ('0', '1', '2', '3'):
            print('RES_TEXT_FORMAT is not right')
            return False
        if str(self.SOURCE_TYPE) not in ('0', '1'):
            print('SOURCE_TYPE is not right')
            return False
        if len(str(self.PROJECT_ID)) == 0:
            print('self.PROJECT_ID can not empty')
            return False
        if str(self.SUB_SERVICE_TYPE) not in ('0', '1'):
            print('SUB_SERVICE_TYPE is not right')
            return False
        if str(self.RES_TYPE) not in ('0', '1'):
            print('RES_TYPE is not right')
            return False
        return True


config = Config()
config.verifyProperties()
offlineSdk.py 中修改的地方有一部分,我已经在源码中进行了注释
# -*- coding:utf-8 -*-
import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse  # added: urllib.unquote moved here in Python 3

import Config


def task_process(audio_url):
    """Submit an offline-recognition task for a publicly reachable audio URL.

    Returns the raw response text from the Tencent AAI endpoint; the
    actual transcript is delivered asynchronously to
    Config.config.CALLBACK_URL.
    """
    request_data = {
        'channel_num': Config.config.CHANNEL_NUM,
        'secretid': Config.config.SECRETID,
        'engine_model_type': Config.config.ENGINE_MODEL_TYPE,
        'timestamp': int(time.time()),
        'expired': int(time.time()) + 3600,
        'nonce': 6666,
        'projectid': Config.config.PROJECT_ID,
        'callback_url': Config.config.CALLBACK_URL,
        'res_text_format': Config.config.RES_TEXT_FORMAT,
        'res_type': Config.config.RES_TYPE,
        'source_type': Config.config.SOURCE_TYPE,
        # Python 3: do NOT urllib.quote() the URL -- the signature is
        # computed over the unquoted value.
        'url': audio_url,
    }
    authorization = generate_sign(request_data)
    task_req_url = generate_request(request_data)
    header = {
        "Content-Type": "application/json",
        "Authorization": authorization,
    }
    r = requests.post(task_req_url, headers=header, data=request_data)
    return r.text


def generate_sign(request_data):
    """Build the HMAC-SHA1 signature over the sorted request parameters."""
    sign_str = "POST" + Config.config.SIGN_URL + str(Config.config.APPID) + "?"
    for key in sorted(request_data.keys()):
        # Python 3: urllib.unquote is urllib.parse.unquote.
        sign_str += key + "=" + urllib.parse.unquote(str(request_data[key])) + '&'
    sign_str = sign_str[:-1]
    # Python 3: hmac.new needs bytes for both key and message.
    digest = hmac.new(bytes(Config.config.SECRET_KEY, 'utf-8'),
                      bytes(sign_str, 'utf-8'), hashlib.sha1).digest()
    # BUG FIX: base64.b64encode returns bytes; decode so the Authorization
    # header value is a real str. (Wrapping the bytes in str() instead
    # yields "b'...'" and fails authentication.)
    return base64.b64encode(digest).decode('utf-8')


def generate_request(request_data):
    """Assemble the task-submission URL with all parameters appended."""
    result_url = Config.config.REQUEST_URL + str(Config.config.APPID) + "?"
    for key in request_data:
        result_url += key + "=" + str(request_data[key]) + '&'
    return result_url[:-1]


if __name__ == '__main__':
    # Publicly downloadable audio URL. Required when source_type == 0,
    # omitted when source_type == 1; length must be in (0, 2048).
    audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
    task_process(audio_url)
OfflineCLient.py也没什么要修改的, audio_url得用公网可访问的url地址,然后运行它就好了。
# -*- coding:utf-8 -*-
import offlineSdk
import Config

# Note: edit the settings in Config.py to your own values before use.
#
# Audio file location. Each call to task_process issues one request.
# The URL must be publicly downloadable. It is required when
# source_type == 0 and omitted when source_type == 1; its length must
# be greater than 0 and less than 2048.
audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"

# Submit the recognition task and show the submission response.
result = offlineSdk.task_process(audio_url)
print(result)

# ------------------------------------------------------------------------------------
# Parameters may be adjusted on the fly between requests, e.g.:
# Config.config.CALLBACK_URL = ""
# Config.config.ENGINE_MODEL_TYPE = "16k_0"
# ......
# audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# result = offlineSdk.task_process(audio_url)
# print (result)
然后你会发现运行结果是这样的,看起来有个success,好像是成功了:
这个时候你肯定会问:我录音的识别结果在哪儿呢?怎么没返回回来?
你肯定还记得 Config.py 的配置里面有一个 CALLBACK_URL = "http://xxx.xxxxx.xxx" 参数,而且根据腾讯的语音识别API文档,这个还是必填的参数,那怎么办? 腾讯是以回调的方式返回识别结果的,而不是像讯飞那样以轮询的方式查看是否有识别结果返回,所以得自己搭建服务处理腾讯回调的POST请求。当然,解决这个问题最简单的方法就是,你可以用 flask 写一个处理回调的路由函数,我把我自己做的数据解析放这儿了,如果觉得有用请自取:
import json

from flask import Flask, request

app = Flask(__name__)


def _parse_sentence(sentence):
    """Split one transcript line into (speaker, content, begin_ms, end_ms).

    Assumed line shape (from the Tencent callback payload -- confirm if
    the API changes): "[M:S.mmm,M:S.mmm,spk] text", i.e. a bracketed
    time/speaker tag, one space, then the sentence text.
    """
    pieces = sentence.split(' ')
    tag, content = pieces[0], pieces[1]
    parts = tag.split(',')

    def to_ms(stamp):
        # "M:S.mmm" -> milliseconds: minutes*60000 + digits of "S.mmm".
        minute, second = stamp.split(":")
        return str(int(minute) * 60000 + int(second.replace(".", "")))

    begin_time = to_ms(parts[0][1:])  # strip the leading '['
    end_time = to_ms(parts[1])
    speaker = parts[-1][:-1]          # strip the trailing ']'
    return speaker, content, begin_time, end_time


@app.route('/data', methods=['POST'])
def testpost():
    """Receive the Tencent ASR callback and save the transcript.

    Writes one tab-separated line per sentence (speaker, content,
    filename, begin ms, end ms) to <audio-file-name>.txt, and answers
    Tencent with the expected {"code", "message"} JSON.
    """
    if request.method == 'POST':
        print(request.form)
        if request.form['message'] == '成功':
            # Derive the output file name from the audio URL.
            filename = request.form['audioUrl'].split('/')[-1].split('.')[0]
            txt_file = filename + ".txt"
            recognition_text = request.form['text']
            # The last element after splitting on '\n' is an empty string.
            sentence_list = recognition_text.split('\n')[0:-1]
            # BUG FIX: use a context manager so the file is closed even if
            # a malformed sentence raises mid-loop (the original leaked the
            # handle); also renamed the response var to stop shadowing the
            # builtin `dict`.
            with open(txt_file, 'w', encoding='utf-8') as doc:
                for sentence in sentence_list:
                    speaker, content, begin_time, end_time = _parse_sentence(sentence)
                    line = speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time
                    print(line)
                    print(line, file=doc)
            response = {
                "code": 0,
                "message": "成功"
            }
        else:
            response = {
                "code": 1,
                "message": "失败"
            }
        print(response)
        return json.dumps(response)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=9979, threaded=True)
当然,如果你有公网可访问的 IP 地址或者域名的话,在Linux上执行上面这段程序:
然后修改CALLBACK_URL 的值为CALLBACK_URL = 'http://你的公网IP:9979/data' ,然后用 Python3 运行 OfflineCLient.py ,过一会儿你就可以在 Linux 这边收到录音识别的结果了。
如果没有公网可访问的 IP 或者域名,请参考: 进行配置。
如果你想上传本地的语音文件进行识别,请参考下面的demo:
# -*- coding: utf-8 -*-
"""
@author: Looking
@email: 2392863668@qq.com
Recognition of a local audio file (uploaded in the POST body).
"""
import os
import requests
import hashlib
import time
import hmac
import base64  # (duplicate import removed)
import urllib
import urllib.parse
import json
import random

import Config
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.asr.v20190614 import asr_client, models

appid = Config.config.APPID
req_url = "https://aai.qcloud.com/asr/v1/"
callback_url = ""
sign_url = "aai.qcloud.com/asr/v1/"
secret_id = Config.config.SECRETID
secret_key = Config.config.SECRET_KEY


def task_process(audio_url):
    """POST a local audio file (source_type=1) and return the raw response text.

    Despite its name, audio_url is a local file path here.
    """
    request_data = {
        'channel_num': 1,
        'secretid': secret_id,
        'engine_model_type': "8k_6",
        'timestamp': int(time.time()),
        'expired': int(time.time()) + 3600,
        'nonce': 1559,
        'projectid': 0,
        'callback_url': callback_url,
        'res_text_format': 0,
        'res_type': 1,
        'source_type': 1,  # 1: audio data is sent in the POST body
        'sub_service_type': 0,
    }
    with open(audio_url, 'rb') as f:
        body_data = f.read()
    body_len = str(len(body_data))
    authorization = generate_sign(request_data, appid)
    task_req_url = generate_request(request_data, appid)
    header = {
        "Authorization": authorization,
        "Content-Length": body_len,
    }
    r = requests.post(task_req_url, headers=header, data=body_data)
    # print(task_req_url)
    # print(r.text)
    return r.text


def generate_sign(request_data, appid):
    """HMAC-SHA1 signature over the sorted, unquoted request parameters."""
    sign_str = "POST" + sign_url + str(appid) + "?"
    for key in sorted(request_data.keys()):
        sign_str += key + "=" + urllib.parse.unquote(str(request_data[key])) + '&'
    sign_str = sign_str[:-1]
    return base64.b64encode(
        hmac.new(bytes(Config.config.SECRET_KEY, 'utf-8'),
                 bytes(sign_str, 'utf-8'), hashlib.sha1).digest())


def generate_request(request_data, appid):
    """Assemble the task-submission URL with all parameters appended."""
    result_url = req_url + str(appid) + "?"
    for key in request_data:
        result_url += key + "=" + str(request_data[key]) + '&'
    return result_url[:-1]


def get_requestId(audio_file_path):
    """Submit the file and return the recognition task's requestId."""
    request_result = task_process(audio_file_path)
    print(request_result)
    # BUG FIX: parse the JSON response with json.loads instead of eval().
    # eval() on network data is a code-injection risk and cannot handle
    # JSON literals such as true/false/null.
    requestId = json.loads(request_result)["requestId"]
    return requestId


def get_recognition_result(requestId):
    """Poll DescribeTaskStatus until the task ends, then print the sentences.

    Prints one tab-separated line (speaker, content, begin ms, end ms)
    per recognised sentence; errors are printed via the SDK exception.
    """
    try:
        cred = credential.Credential(Config.config.SECRETID, Config.config.SECRET_KEY)
        httpProfile = HttpProfile()
        httpProfile.endpoint = "asr.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = asr_client.AsrClient(cred, "ap-guangzhou", clientProfile)
        while True:
            req = models.DescribeTaskStatusRequest()
            # e.g. 537731632
            params = '{"TaskId":%s}' % requestId
            req.from_json_string(params)
            resp = client.DescribeTaskStatus(req)
            recognition_text = json.loads(resp.to_json_string())
            recognition_status = recognition_text['Data']['StatusStr']
            if recognition_status == "success":
                print(recognition_text['Data']['TaskId'], "识别成功!")
                break
            if recognition_status == "failed":
                raise TencentCloudSDKException
            time.sleep(1)
        # print(recognition_text)
        recognition_text = recognition_text['Data']['Result']
        # The last element after splitting on '\n' is an empty string.
        sentence_list = recognition_text.split('\n')[0:-1]
        for sentence in sentence_list:
            tag = sentence.split(' ')[0]       # "[M:S.mmm,M:S.mmm,spk]"
            content = sentence.split(' ')[1]   # sentence text
            begin_time = tag.split(',')[0][1:]
            begin_time = str(int(begin_time.split(":")[0]) * 60000
                             + int(begin_time.split(":")[1].replace(".", "")))
            end_time = tag.split(',')[1]
            end_time = str(int(end_time.split(":")[0]) * 60000
                           + int(end_time.split(":")[1].replace(".", "")))
            speaker = tag.split(',')[-1][:-1]
            print(speaker + "\t" + content + '\t' + begin_time + '\t' + end_time)
            # print(speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time, file=doc)
    except TencentCloudSDKException as err:
        print(err)


if __name__ == '__main__':
    audio_file_path = r"D:\MyProject\Python\audio_recognition\audio\o2020031309513910300127.wav"
    requestId = get_requestId(audio_file_path)
    get_recognition_result(requestId)
通过回调返回的结果除了没有 audioUrl 参数之外,其他部分与直接使用录音的 url 返回的数据是一样的。
转载地址:http://objqi.baihongyu.com/