DeepSpeech is an open-source speech recognition engine developed by Mozilla. Built on deep learning, in particular recurrent neural networks (RNNs), it performs well on speech recognition tasks. Its core strengths include:
- High accuracy: trained on large datasets with a well-designed model architecture, DeepSpeech achieves solid recognition accuracy across multiple languages and accents.
- Low latency: suitable for real-time speech recognition scenarios, responding quickly to incoming audio.
- Open source and free: both the code and the pretrained models are fully open source, so users can customize and optimize them for their own needs.
Before building the speech recognition system, we need to set up the development environment. The following software and libraries are required:
1. Python: Python 3.5-3.9 (64-bit) is recommended for compatibility with the DeepSpeech bindings.
2. DeepSpeech library: the latest version can be installed via pip.
3. NumPy and SciPy: for numerical computation and signal processing.
4. PyAudio: for capturing and processing real-time audio streams.
Install DeepSpeech:
```bash
pip install deepspeech
```
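The supporting libraries can be installed the same way. Note that on some platforms PyAudio needs the PortAudio development headers installed first (via the system package manager):
```bash
pip install numpy scipy pyaudio
```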
Download the model and scorer files:
- Official GitHub releases: https://github.com/mozilla/DeepSpeech/releases
- 一条瑾瑜的小站 (mirror): https://www.jinyuttsrz.top/index.php/archives/89/
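For version 0.9.3, the acoustic model and scorer used by the scripts below follow the release's standard file naming; the URLs below match that convention, but verify them against the releases page:
```bash
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer
```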
1. Recording from the microphone, then transcribing with DeepSpeech
```python
import pyaudio
import numpy as np
from deepspeech import Model

def capture_audio(duration=5, sample_rate=16000):
    """Record `duration` seconds of 16-bit mono audio from the microphone."""
    print("Recording started")
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                    input=True, frames_per_buffer=1024)
    frames = []
    for _ in range(0, int(sample_rate / 1024 * duration)):
        data = stream.read(1024)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    return b''.join(frames)

def preprocess_audio(audio_data):
    """Convert raw PCM bytes into the int16 array DeepSpeech expects."""
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    return audio_array

def load_model(model_path='deepspeech-0.9.3-models.pbmm',
               scorer_path='deepspeech-0.9.3-models.scorer'):
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)  # the language-model scorer improves accuracy
    return model

def speech_to_text(model, audio_array):
    text = model.stt(audio_array)
    return text

def main():
    model = load_model()
    audio_data = capture_audio(duration=5)
    processed_audio = preprocess_audio(audio_data)
    text = speech_to_text(model, processed_audio)
    print(f"Recognition result: {text}")

if __name__ == "__main__":
    main()
```
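In addition to the one-shot `stt()` call used above, the DeepSpeech Python bindings provide a streaming API (`createStream()`, `feedAudioContent()`, `finishStream()`) that decodes audio chunk by chunk, so partial results are available before recording ends. A minimal sketch along the same lines as the script above, assuming the same 0.9.3 model file and a 16 kHz microphone; treat it as a starting point, not a drop-in replacement:
```python
import numpy as np
import pyaudio
from deepspeech import Model

def stream_transcribe(model_path='deepspeech-0.9.3-models.pbmm',
                      duration=5, sample_rate=16000):
    model = Model(model_path)
    ds_stream = model.createStream()  # incremental decoding context
    p = pyaudio.PyAudio()
    mic = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                 input=True, frames_per_buffer=1024)
    for _ in range(int(sample_rate / 1024 * duration)):
        chunk = np.frombuffer(mic.read(1024), dtype=np.int16)
        ds_stream.feedAudioContent(chunk)  # feed 16-bit PCM as it arrives
        # ds_stream.intermediateDecode() would return a partial transcript here
    mic.stop_stream()
    mic.close()
    p.terminate()
    return ds_stream.finishStream()  # final transcript

if __name__ == "__main__":
    print(stream_transcribe())
```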
2. Reading audio from a file, then transcribing with DeepSpeech
```python
import os
import wave
import numpy as np
from deepspeech import Model

script_dir = os.path.dirname(os.path.abspath(__file__))

def load_audio_file(file_path):
    """Load audio data from a WAV file."""
    print(f"Reading audio file: {file_path}")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    # Read the WAV file with the wave module
    with wave.open(file_path, 'rb') as wf:
        # Inspect the audio parameters; DeepSpeech expects 16 kHz mono 16-bit PCM
        sample_width = wf.getsampwidth()
        num_channels = wf.getnchannels()
        sample_rate = wf.getframerate()
        num_frames = wf.getnframes()
        print(f"Audio info: {num_channels} channel(s), {sample_rate} Hz, "
              f"{sample_width}-byte samples, {num_frames} frames")
        # Read the raw PCM data
        audio_data = wf.readframes(num_frames)
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        return audio_array.tobytes()

def preprocess_audio(audio_data):
    """Preprocess the raw audio bytes."""
    if audio_data is None:
        return None
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    print(f"Audio length: {len(audio_array)} samples")
    # Normalize to [-1.0, 1.0] as float32
    audio_array = audio_array.astype(np.float32) / 32768.0
    return audio_array

def load_model():
    # The model and scorer files are expected next to this script
    model_path = os.path.join(script_dir, 'deepspeech-0.9.3-models.pbmm')
    scorer_path = os.path.join(script_dir, 'deepspeech-0.9.3-models.scorer')
    print(f"Loading model: {model_path}")
    print(f"Loading scorer: {scorer_path}")
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)
    return model

def speech_to_text(model, audio_array):
    text = model.stt(audio_array)
    return text

def main():
    model = load_model()
    audio_file_path = os.path.join(script_dir, "eng1.wav")
    audio_data = load_audio_file(file_path=audio_file_path)
    processed_audio = preprocess_audio(audio_data)
    if processed_audio is not None:
        # model.stt() takes int16 samples, so convert back from float32
        audio_int16 = (processed_audio * 32768.0).astype(np.int16)
        text = speech_to_text(model, audio_int16)
        # Guard against possible Unicode encoding issues in the output
        text = text.encode('utf-8', 'replace').decode('utf-8')
        print(f"Recognition result: {text}")
    else:
        print("Unable to process audio data")

if __name__ == "__main__":
    main()
```
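The script above assumes the WAV file is already 16 kHz mono. If your recordings use a different sample rate, they can be resampled with SciPy (already listed in the environment setup) before decoding. A minimal sketch, assuming int16 mono input; `resample_to_16k` is a hypothetical helper for illustration, not part of DeepSpeech:
```python
from math import gcd
import numpy as np
from scipy.signal import resample_poly

def resample_to_16k(audio_array, orig_rate, target_rate=16000):
    """Resample int16 PCM to the 16 kHz rate the DeepSpeech models expect."""
    if orig_rate == target_rate:
        return audio_array
    g = gcd(target_rate, orig_rate)
    # Polyphase resampling; work in float to avoid integer overflow
    resampled = resample_poly(audio_array.astype(np.float32),
                              target_rate // g, orig_rate // g)
    return np.clip(resampled, -32768, 32767).astype(np.int16)
```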