语音听写,顾名思义,就是把我们说的话转成文字。不过讯飞官网提供的示例是将音频文件转成文字;如果想通过 Python 调用麦克风,把我们对着电脑实时说的话转成文字,就需要以音频流的方式传输麦克风采集到的数据。这样就可以实现指令识别、语音转文字等功能,并结合实际业务使用。

# This is a sample Python script.
# 变量
import base64
import datetime
import hashlib
import hmac
import json
import ssl
import threading
import time
from builtins import str
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time

import pyaudio
import websocket

# WebSocket endpoint of the dictation service; product_url() appends the
# authentication query string to it.
host_url = "wss://ws-api.xfyun.cn/v2/iat"
appid = ""  # obtain from the console; sent as "app_id" in the first frame
api_secret = ""  # passed to product_url() and used as the HMAC signing key
api_key = ""  # passed to product_url() and embedded in the authorization string
audio_file = "./1.pcm"  # NOTE(review): not referenced by the code visible here
send_flag = True  # on_message() sets this to False when the server reports status 2


def product_url(api_secret, api_key):
    """Build the authenticated WebSocket URL for the dictation endpoint.

    An HMAC-SHA256 signature is computed over the ``host`` line, the ``date``
    line and the request line, keyed with ``api_secret``.  The signature is
    wrapped into an authorization string together with ``api_key``, which is
    Base64-encoded and appended to ``host_url`` as query parameters.

    The signature and authorization token are deliberately not printed:
    they are credentials and should not end up in console output or logs.

    Args:
        api_secret: Secret used as the HMAC key.
        api_key: Key placed in the authorization string.

    Returns:
        ``host_url`` followed by ``?`` and the URL-encoded ``authorization``,
        ``date`` and ``host`` parameters.
    """
    host = "ws-api.xfyun.cn"

    # HTTP-date string in GMT, e.g. "Fri, 18 Oct 2024 07:39:19 GMT".
    now_time = datetime.now()
    now_date = format_date_time(mktime(now_time.timetuple()))

    # Raw text that gets signed: host line, date line, request line.
    origin_base = f"host: {host}\ndate: {now_date}\nGET /v2/iat HTTP/1.1"

    # HMAC-SHA256 over the raw text, then Base64.
    digest = hmac.new(
        api_secret.encode('utf-8'),
        origin_base.encode('utf-8'),
        digestmod=hashlib.sha256,
    ).digest()
    signature_sha = base64.b64encode(digest).decode(encoding='utf-8')

    authorization_origin = (
        f'api_key="{api_key}", algorithm="hmac-sha256", '
        f'headers="host date request-line", signature="{signature_sha}"'
    )
    authorization = base64.b64encode(
        authorization_origin.encode('utf-8')
    ).decode(encoding='utf-8')

    # Collect the authentication parameters and append them as a query string.
    dict_data = {
        "authorization": authorization,
        "date": now_date,
        "host": host,
    }
    return host_url + '?' + urlencode(dict_data)


def on_message(ws, message):
    """Handle one server message: print the recognized text, close when done.

    The message is parsed once.  Missing fields are tolerated instead of
    raising ``KeyError``: an error response may carry no ``data`` object, and
    a result may carry no ``pgs`` marker.

    Args:
        ws: The WebSocket application object; only ``ws.close()`` is used.
        message: JSON text received from the server.

    Side effects:
        Sets the module-level ``send_flag`` to ``False`` and closes ``ws``
        when the server reports ``status == 2`` or a non-zero ``code``.
    """
    global send_flag
    print(f"Received message: {message}")
    payload = json.loads(message)

    # NOTE(review): assumes a non-zero "code" marks a failed request, as in
    # the service's documented response format — confirm against the API docs.
    code = payload.get("code", 0)
    if code != 0:
        print(f"Error response: code={code}, message={payload.get('message')}")
        send_flag = False
        ws.close()
        return

    data = payload.get("data") or {}
    result = data.get("result") or {}
    if result:
        # Concatenate every candidate word of every segment, in order.
        res = "".join(
            word.get("w", "")
            for segment in result.get("ws", [])
            for word in segment.get("cw", [])
        )
        mark = result.get("pgs")  # may be absent; printed as None then
        print(f"{res} ---标志:{mark}")

    if data.get("status") == 2:
        send_flag = False
        ws.close()


def on_error(ws, error):
    """Print the reported error followed by the WebSocket object.

    Args:
        ws: The WebSocket application object the error belongs to.
        error: The error value passed to this callback.
    """
    report = f"Error: {error},{ws}"
    print(report)


def on_close(ws, reason, res):
    """Print a notice that the WebSocket connection has been closed.

    Args:
        ws: The WebSocket application object that was closed.
        reason: Second callback argument; not used here.
        res: Third callback argument; not used here.
    """
    notice = f"WebSocket connection closed,{ws}"
    print(notice)


def on_open(ws):
    """Announce the opened connection and send the first frame.

    The first frame carries the application id, the recognition settings and
    a data section with ``status`` 0 and an empty ``audio`` field.  No further
    frames are sent by this function.

    Args:
        ws: The WebSocket application object; the frame is sent via ``ws.send``.
    """
    print(f"WebSocket connection opened,{ws},ws连接建立成功...")

    # Application identity.
    common_part = {"app_id": appid}

    # Recognition settings.
    business_part = {
        "language": "zh_cn",
        "domain": "iat",
        "accent": "mandarin",
        "dwa": "wpgs",
    }

    # Status 0 marks this as the first frame; it carries no audio.
    data_part = {
        "status": 0,
        "format": "audio/L16;rate=16000",
        "encoding": "raw",
        "audio": "",
    }

    opening_frame = {
        "common": common_part,
        "business": business_part,
        "data": data_part,
    }
    ws.send(json.dumps(opening_frame))


def close_connection(ws):
    """Print a closing notice, then close the given WebSocket.

    Args:
        ws: Object exposing a ``close()`` method.
    """
    notice = "Closing WebSocket connection..."
    print(notice)
    ws.close()


# Script entry point: open the WebSocket session and report how long it took.
if __name__ == '__main__':
    start_time = datetime.now()
    websocket.enableTrace(False)
    ws_url = product_url(api_secret, api_key)
    ws_entity = websocket.WebSocketApp(
        ws_url,
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )
    # Verify the server certificate. The URL carries the authorization token,
    # so turning verification off (ssl.CERT_NONE) would expose it to a
    # man-in-the-middle.
    ws_entity.run_forever(sslopt={"cert_reqs": ssl.CERT_REQUIRED})
    end_time = datetime.now()
    print(f"听写耗时: {end_time - start_time}")

Logo

GitCode 天启AI是一款由 GitCode 团队打造的智能助手,基于先进的LLM(大语言模型)与多智能体 Agent 技术构建,致力于为用户提供高效、智能、多模态的创作与开发支持。它不仅支持自然语言对话,还具备处理文件、生成 PPT、撰写分析报告、开发 Web 应用等多项能力,真正做到“一句话,让 AI 帮你完成复杂任务”。

更多推荐