DeepSpeech is an open-source speech recognition engine developed by Mozilla. Built on deep learning, in particular recurrent neural networks (RNNs), it performs well on speech recognition tasks. Its core strengths include:
- High accuracy: trained on large datasets with a well-designed model architecture, DeepSpeech achieves solid recognition accuracy across multiple languages and accents.
- Low latency: suitable for real-time speech recognition scenarios, responding quickly to incoming audio.
- Open source and free: both the code and the pretrained models are fully open source, so users can customize and optimize them for their own needs.
Before building the speech recognition system, we need to set up the development environment. The following software and libraries are required:
1. Python: Python 3.5-3.9 (64-bit) is recommended for compatibility with the DeepSpeech bindings.
2. DeepSpeech library: the latest version can be installed via pip.
3. NumPy and SciPy: for numerical computation and signal processing.
4. PyAudio: for capturing and processing real-time audio streams.
Install DeepSpeech:
```bash
pip install deepspeech
```
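The supporting libraries can be installed the same way. Note that on some platforms PyAudio needs the PortAudio development headers installed first (via the system package manager):
```bash
pip install numpy scipy pyaudio
```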
Download the model and scorer files:
- Official GitHub releases: https://github.com/mozilla/DeepSpeech/releases
- 一条瑾瑜的小站 (mirror): https://www.jinyuttsrz.top/index.php/archives/89/
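For version 0.9.3, the acoustic model and scorer used by the scripts below follow the release's standard file naming; the URLs below match that convention, but verify them against the releases page:
```bash
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer
```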
1. Recording from the microphone, then transcribing with DeepSpeech
```python
import pyaudio
import numpy as np
from deepspeech import Model

def capture_audio(duration=5, sample_rate=16000):
    """Record `duration` seconds of 16-bit mono audio from the microphone."""
    print("Recording started")
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                    input=True, frames_per_buffer=1024)
    frames = []
    for _ in range(0, int(sample_rate / 1024 * duration)):
        data = stream.read(1024)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    return b''.join(frames)

def preprocess_audio(audio_data):
    """Convert raw PCM bytes into the int16 array DeepSpeech expects."""
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    return audio_array

def load_model(model_path='deepspeech-0.9.3-models.pbmm',
               scorer_path='deepspeech-0.9.3-models.scorer'):
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)  # the language-model scorer improves accuracy
    return model

def speech_to_text(model, audio_array):
    text = model.stt(audio_array)
    return text

def main():
    model = load_model()
    audio_data = capture_audio(duration=5)
    processed_audio = preprocess_audio(audio_data)
    text = speech_to_text(model, processed_audio)
    print(f"Recognition result: {text}")

if __name__ == "__main__":
    main()
```
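In addition to the one-shot `stt()` call used above, the DeepSpeech Python bindings provide a streaming API (`createStream()`, `feedAudioContent()`, `finishStream()`) that decodes audio chunk by chunk, so partial results are available before recording ends. A minimal sketch along the same lines as the script above, assuming the same 0.9.3 model file and a 16 kHz microphone; treat it as a starting point, not a drop-in replacement:
```python
import numpy as np
import pyaudio
from deepspeech import Model

def stream_transcribe(model_path='deepspeech-0.9.3-models.pbmm',
                      duration=5, sample_rate=16000):
    model = Model(model_path)
    ds_stream = model.createStream()  # incremental decoding context
    p = pyaudio.PyAudio()
    mic = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                 input=True, frames_per_buffer=1024)
    for _ in range(int(sample_rate / 1024 * duration)):
        chunk = np.frombuffer(mic.read(1024), dtype=np.int16)
        ds_stream.feedAudioContent(chunk)  # feed 16-bit PCM as it arrives
        # ds_stream.intermediateDecode() would return a partial transcript here
    mic.stop_stream()
    mic.close()
    p.terminate()
    return ds_stream.finishStream()  # final transcript

if __name__ == "__main__":
    print(stream_transcribe())
```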
2. Reading audio from a file, then transcribing with DeepSpeech
```python
import os
import wave
import numpy as np
from deepspeech import Model

script_dir = os.path.dirname(os.path.abspath(__file__))

def load_audio_file(file_path):
    """Load audio data from a WAV file."""
    print(f"Reading audio file: {file_path}")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    # Read the WAV file with the wave module
    with wave.open(file_path, 'rb') as wf:
        # Inspect the audio parameters; DeepSpeech expects 16 kHz mono 16-bit PCM
        sample_width = wf.getsampwidth()
        num_channels = wf.getnchannels()
        sample_rate = wf.getframerate()
        num_frames = wf.getnframes()
        print(f"Audio info: {num_channels} channel(s), {sample_rate} Hz, "
              f"{sample_width}-byte samples, {num_frames} frames")
        # Read the raw PCM data
        audio_data = wf.readframes(num_frames)
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        return audio_array.tobytes()

def preprocess_audio(audio_data):
    """Preprocess the raw audio bytes."""
    if audio_data is None:
        return None
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    print(f"Audio length: {len(audio_array)} samples")
    # Normalize to [-1.0, 1.0] as float32
    audio_array = audio_array.astype(np.float32) / 32768.0
    return audio_array

def load_model():
    # The model and scorer files are expected next to this script
    model_path = os.path.join(script_dir, 'deepspeech-0.9.3-models.pbmm')
    scorer_path = os.path.join(script_dir, 'deepspeech-0.9.3-models.scorer')
    print(f"Loading model: {model_path}")
    print(f"Loading scorer: {scorer_path}")
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)
    return model

def speech_to_text(model, audio_array):
    text = model.stt(audio_array)
    return text

def main():
    model = load_model()
    audio_file_path = os.path.join(script_dir, "eng1.wav")
    audio_data = load_audio_file(file_path=audio_file_path)
    processed_audio = preprocess_audio(audio_data)
    if processed_audio is not None:
        # model.stt() takes int16 samples, so convert back from float32
        audio_int16 = (processed_audio * 32768.0).astype(np.int16)
        text = speech_to_text(model, audio_int16)
        # Guard against possible Unicode encoding issues in the output
        text = text.encode('utf-8', 'replace').decode('utf-8')
        print(f"Recognition result: {text}")
    else:
        print("Unable to process audio data")

if __name__ == "__main__":
    main()
```
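The script above assumes the WAV file is already 16 kHz mono. If your recordings use a different sample rate, they can be resampled with SciPy (already listed in the environment setup) before decoding. A minimal sketch, assuming int16 mono input; `resample_to_16k` is a hypothetical helper for illustration, not part of DeepSpeech:
```python
from math import gcd
import numpy as np
from scipy.signal import resample_poly

def resample_to_16k(audio_array, orig_rate, target_rate=16000):
    """Resample int16 PCM to the 16 kHz rate the DeepSpeech models expect."""
    if orig_rate == target_rate:
        return audio_array
    g = gcd(target_rate, orig_rate)
    # Polyphase resampling; work in float to avoid integer overflow
    resampled = resample_poly(audio_array.astype(np.float32),
                              target_rate // g, orig_rate // g)
    return np.clip(resampled, -32768, 32767).astype(np.int16)
```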