Vosk 是一款开源、离线的语音转文字工具,支持多语言和实时转写,核心优势是无需网络、低延迟且部署灵活。
核心特点
完全离线运行,不依赖云端服务,保护数据隐私。支持 20+ 语言(含中文),提供预训练模型,可直接使用。轻量高效,适配桌面(Windows/Mac/Linux)、移动端(Android/iOS)和嵌入式设备。支持实时流式转写和批量音频文件处理,准确率表现稳定。
适用场景
离线环境下的语音笔记、会议记录转写。嵌入式设备(如智能音箱、机器人)的语音交互。需保护数据隐私的企业级语音转写需求。开发自定义语音转文字应用(提供 Python/Java/C++ 等多语言 API)。
基础使用步骤
下载对应语言的预训练模型(中文模型约 1GB,轻量版更小)。安装 Vosk 核心库(Python 可通过 pip install vosk 快速安装)。导入库并加载模型,读取音频文件或实时捕获麦克风输入。调用转写接口,获取文字结果(支持逐句输出或完整文本)。
conda activate vosk
conda install pip
pip install --upgrade pip
安装vosk
pip install vosk
安装sounddevice
pip install sounddevice
模型下载:https://alphacephei.com/vosk/models
访问页面:在浏览器中输入http://localhost:8000/index.html
chrome://flags/#unsafely-treat-insecure-origin-as-secure
http://192.168.3.122:8000/socket.html
dependencies
python 3.8.10、vosk、vosk-model-cn-0.22、sounddevice、websocket
模型准备
pretrained_model,已存放在model文件夹下
测试
cd 进入包含页面HTML文件的目录
cd ./templates
使用 Python 的 http.server 模块
python -m http.server 8000
浏览器麦克风需要 https 安全策略,localhost 可忽略。
开发环境可在 chrome 浏览器配置 chrome://flags/#unsafely-treat-insecure-origin-as-secure,将 http://192.168.3.122:8000 加入安全网站;重启浏览器后,在浏览器中输入本地地址 http://localhost:8000/socket.html
或者 http://192.168.3.122:8000/socket.html
VoiceRecognitionServer.py
import asyncio
import json
import os
import uuid
import websockets
from vosk import Model, KaldiRecognizer, SetLogLevel
from typing import Set, Dict
# 配置日志级别
SetLogLevel(-1) # 禁用Vosk日志
class VoiceRecognitionServer:
    """WebSocket server that streams 16-bit PCM audio into a Vosk recognizer.

    Each connected client gets its own ``KaldiRecognizer`` so audio streams
    never interleave; partial and final transcription results are pushed back
    over the same WebSocket as JSON messages.
    """

    def __init__(self):
        self.host = "0.0.0.0"
        self.port = 5678
        # Expected PCM sample rate (Hz); the browser client downsamples to this.
        self.sample_rate = 16000
        self.max_connections = 100
        # Currently-open sockets, used to enforce the connection cap.
        self.active_connections: Set[websockets.WebSocketServerProtocol] = set()
        # One recognizer per client, keyed by the client's UUID.
        self.client_recognizers: Dict[str, KaldiRecognizer] = {}
        self.model = self.load_model()

    def load_model(self):
        """Load the Vosk model from ``../model``; abort the process if missing."""
        model_path = "../model"
        if not os.path.exists(model_path):
            print(f"[ERROR] 请从 https://alphacephei.com/vosk/models 下载模型并解压到 {model_path} 文件夹")
            # raise SystemExit instead of exit(): the exit() builtin is only
            # available when the `site` module is loaded.
            raise SystemExit(1)
        print("[INFO] 正在加载语音识别模型...")
        model = Model(model_path)
        print("[INFO] 模型加载完成")
        return model

    async def handle_connection(self, websocket, path=None):
        """Per-connection lifecycle: register, stream messages, clean up.

        ``path`` defaults to None for compatibility with websockets >= 13,
        where handlers are invoked with the connection object only; older
        versions still pass the path positionally.
        """
        client_id = str(uuid.uuid4())
        print(f"[CONNECT] 新客户端连接: {client_id}")
        if len(self.active_connections) >= self.max_connections:
            await websocket.send(json.dumps({
                "error": "server_busy",
                "message": "服务器达到最大连接数"
            }))
            await websocket.close()
            return
        try:
            self.active_connections.add(websocket)
            recognizer = KaldiRecognizer(self.model, self.sample_rate)
            recognizer.SetWords(True)  # include per-word info in final results
            self.client_recognizers[client_id] = recognizer
            await websocket.send(json.dumps({
                "status": "connected",
                "client_id": client_id,
                "sample_rate": self.sample_rate
            }))
            async for message in websocket:
                await self.process_message(websocket, client_id, message)
        except websockets.ConnectionClosed:
            print(f"[DISCONNECT] 客户端断开: {client_id}")
        except Exception as e:
            print(f"[ERROR] 客户端 {client_id} 处理异常: {str(e)}")
        finally:
            self.cleanup_client(client_id, websocket)

    async def process_message(self, websocket, client_id, message):
        """Handle one WebSocket frame: "ping" text, or binary 16-bit PCM audio."""
        try:
            if isinstance(message, str):
                if message == "ping":
                    await websocket.send("pong")
                # Fix: return for ANY text frame. Previously a non-"ping"
                # string fell through to AcceptWaveform, which would raise a
                # TypeError on str input.
                return
            recognizer = self.client_recognizers.get(client_id)
            if not recognizer:
                return
            # 16-bit little-endian PCM: every sample is 2 bytes, so an odd
            # byte count means a truncated/corrupt frame.
            if len(message) % 2 != 0:
                print(f"[WARN] 客户端 {client_id} 发送了异常长度的音频数据")
                return
            if recognizer.AcceptWaveform(message):
                # Utterance boundary reached: send the final transcript.
                result = json.loads(recognizer.Result())
                await websocket.send(json.dumps({
                    "text": result.get("text", ""),
                    "final": True,
                    "client_id": client_id
                }))
            else:
                # Mid-utterance: send the running partial transcript.
                partial = json.loads(recognizer.PartialResult())
                await websocket.send(json.dumps({
                    "partial": partial.get("partial", ""),
                    "final": False,
                    "client_id": client_id
                }))
        except Exception as e:
            print(f"[ERROR] 处理客户端 {client_id} 消息失败: {str(e)}")
            await websocket.send(json.dumps({
                "error": "processing_error",
                "message": str(e)
            }))

    def cleanup_client(self, client_id, websocket):
        """Release the client's socket and recognizer; safe to call repeatedly."""
        self.active_connections.discard(websocket)
        if self.client_recognizers.pop(client_id, None) is not None:
            print(f"[CLEANUP] 已清理客户端资源: {client_id}")

    async def run_server(self):
        """Start the WebSocket server and block forever."""
        print(f"[SERVER] 启动语音识别服务器 {self.host}:{self.port}")
        async with websockets.serve(
            self.handle_connection,
            self.host,
            self.port,
            ping_interval=20,
            ping_timeout=60,
            max_size=2 ** 20  # 1MB cap per frame
        ):
            print(f"[SERVER] 服务器已就绪,等待连接...")
            await asyncio.Future()  # run until cancelled
if __name__ == "__main__":
    # Entry point: build the server (loads the model) and run until stopped.
    try:
        server = VoiceRecognitionServer()
        asyncio.run(server.run_server())
    except KeyboardInterrupt:
        # Ctrl-C is the normal shutdown path.
        print("[SERVER] 服务器正常关闭")
    except Exception as e:
        print(f"[CRITICAL] 服务器崩溃: {str(e)}")
socket.html
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>语音实时转文字</title>
<link rel="shortcut icon"href="./ai.ico"type="image/x-icon">
<!-- <script src="https://cdn.tailwindcss.com"></script>-->
<!-- <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.7.2/css/all.min.css">
<!-- <script src="https://cdn.jsdelivr.net/gh/mattdiamond/Recorderjs@master/dist/recorder.js"></script>-->
<script src="./js/tailwind.min.js"></script>
<link href="./css/all.min.css" rel="stylesheet">
<script src="./js/recorder.js"></script>
<style>
@font-face {
font-family: 'Font Awesome 6 Free';
font-style: normal;
font-weight: 900;
font-display: block;
src: url("./fonts/fa-solid-900.woff2") format("woff2");
}
@font-face {
font-family: 'Font Awesome 6 Free';
font-style: normal;
font-weight: 400;
font-display: block;
src: url("./fonts/fa-regular-400.woff2") format("woff2");
}
</style>
</head>
<body class="bg-gray-100 flex justify-center items-center h-screen">
<div class="bg-white p-8 rounded shadow-md w-full max-w-md">
<h1 class="text-2xl font-bold mb-4">语音实时转文字</h1>
<div class="mb-4">
<label class="block text-sm font-medium mb-1">服务器地址</label>
<input id="serverUrl" type="text" value="ws://192.168.3.122:5678"
class="w-full p-2 border rounded">
</div>
<button id="toggleButton" class="bg-blue-500 text-white py-2 px-4 rounded hover:bg-blue-600 focus:outline-none">
<i class="fa-solid fa-microphone"></i> 开始录音
</button>
<div class="mt-4 p-4 bg-gray-100 rounded">
<div id="result" class="min-h-40 max-h-60 overflow-y-auto mb-2 p-2 bg-white rounded">
<div
id="editableText"
class="w-full h-full outline-none overflow-y-auto"
contenteditable="true"
spellcheck="false"
placeholder="准备就绪,点击上方按钮开始录音..."
></div>
</div>
<div class="flex justify-between">
<button id="clearButton" class="text-sm text-gray-500 hover:text-gray-700">
<i class="fa-solid fa-trash-can"></i> 清空结果
</button>
<span id="status" class="text-sm text-gray-500">未连接</span>
</div>
</div>
</div>
<script>
// DOM handles used throughout the recorder UI.
const toggleButton = document.getElementById('toggleButton');
const clearButton = document.getElementById('clearButton');
const resultDiv = document.getElementById('result');
const serverUrlInput = document.getElementById('serverUrl');
const statusSpan = document.getElementById('status');
const editableText = document.getElementById('editableText');
// Recording/session state, (re)initialized in startRecording/stopRecording.
let recorder;       // Recorder.js instance (kept for compatibility)
let audioContext;   // Web Audio context, created on record start
let socket;         // WebSocket to the recognition server
let audioStream;    // MediaStream from getUserMedia
let isRecording = false;
let audioBuffer = [];  // pending downsampled 16-bit PCM samples
let resampleRatio = 3; // 48kHz → 16kHz — assumes a 48 kHz input device; TODO confirm
// Show a status message in the footer: green for normal, red for errors.
function updateStatus(message, isError = false) {
    const colorClass = isError ? 'text-red-500' : 'text-green-500';
    statusSpan.textContent = message;
    statusSpan.className = `text-sm ${colorClass}`;
}
// Append a final transcript chunk to the editable area.
// Certain spoken phrases are treated as voice commands instead of text.
function appendResult(text, isFinal) {
    if (isFinal) {
        // Drop any leftover temporary (partial-result) nodes.
        document.querySelectorAll('.temp-result').forEach((node) => node.remove());
        const spoken = text.trim();
        if (spoken === '发送') {
            // Voice command: trigger the (simulated) send action.
            send();
        } else if (spoken === '清空' || spoken === '清空结果') {
            // Voice command: clear the transcript.
            clearButton.click();
        } else {
            editableText.textContent += text;
        }
    }
    // Keep the newest text scrolled into view.
    resultDiv.scrollTop = resultDiv.scrollHeight;
}
// Render the in-progress (partial) transcript in a temporary grey line.
// Fix: the old code did `editableText.textContent += lastLine`, which
// concatenated the DOM node itself — producing "[object HTMLElement]" or
// "null" in the editable text. This restores the intended behavior: keep a
// single `.temp-result` paragraph (removed again by appendResult on a
// final result) and update its text.
function updatePartialResult(text) {
    let lastLine = resultDiv.lastElementChild;
    if (!lastLine || !lastLine.classList.contains('temp-result')) {
        lastLine = document.createElement('p');
        lastLine.className = 'mb-1 text-gray-500 temp-result';
        resultDiv.appendChild(lastLine);
    }
    lastLine.textContent = text;
}
// Simulated "send": surface the current transcript via an alert.
function send() {
    const content = editableText.textContent;
    if (content) {
        alert("模拟发送:" + content);
    } else {
        alert("没有需要发送的内容@_@!");
    }
}
// Open the WebSocket to the recognition server and wire up its handlers.
// The returned promise resolves once the connection is established and
// rejects on a connection error.
async function connectWebSocket() {
    return new Promise((resolve, reject) => {
        updateStatus("连接中...");
        socket = new WebSocket(serverUrlInput.value);
        socket.binaryType = 'arraybuffer';

        socket.onopen = () => {
            updateStatus("已连接");
            resolve();
        };
        socket.onerror = (error) => {
            updateStatus("连接错误", true);
            reject(error);
        };
        socket.onclose = () => {
            if (isRecording) {
                // Server dropped us mid-recording: stop and report.
                updateStatus("连接意外断开", true);
                stopRecording();
            } else {
                updateStatus("已断开");
            }
        };

        // Remember the last committed transcript so a duplicated final
        // message from the server is not appended twice.
        let lastFinalResult = "";
        socket.onmessage = (event) => {
            try {
                const data = JSON.parse(event.data);
                if (data.final && data.text && data.text !== lastFinalResult) {
                    // New final result: commit it.
                    appendResult(data.text, true);
                    lastFinalResult = data.text;
                } else if (!data.final && data.partial) {
                    // In-progress result: show it temporarily.
                    updatePartialResult(data.partial);
                }
            } catch (e) {
                console.error("消息解析错误:", e);
            }
        };
    });
}
// Start capturing microphone audio, downsampling it to 16 kHz 16-bit PCM,
// and streaming it to the recognition server in 100 ms chunks.
async function startRecording() {
    try {
        // 1. Request a mono microphone stream with browser audio processing
        //    disabled so the recognizer receives raw audio.
        audioStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                channelCount: 1,
                echoCancellation: false,
                noiseSuppression: false,
                autoGainControl: false
            },
            video: false
        });
        // 2. Create the AudioContext.
        audioContext = new (window.AudioContext || window.webkitAudioContext)();
        // Fix: derive the decimation ratio from the device's real sample
        // rate instead of the hard-coded 3 (which assumed 48 kHz — a
        // 44.1 kHz device previously fed mis-rated audio to the server).
        resampleRatio = audioContext.sampleRate / 16000;
        const source = audioContext.createMediaStreamSource(audioStream);
        // 3. Downsample in a ScriptProcessor.
        //    NOTE(review): ScriptProcessorNode is deprecated; consider
        //    migrating to AudioWorklet.
        const processor = audioContext.createScriptProcessor(4096, 1, 1);
        processor.onaudioprocess = (e) => {
            const input = e.inputBuffer.getChannelData(0);
            // Naive decimation: take every resampleRatio-th sample and
            // convert float [-1, 1] to clamped int16.
            for (let i = 0; i < input.length; i += resampleRatio) {
                const val = input[Math.floor(i)] * 32767;
                audioBuffer.push(val > 32767 ? 32767 : val < -32768 ? -32768 : val);
            }
            // Ship 1600 samples (100 ms at 16 kHz) at a time.
            // Fix: guard on `socket` itself — audio callbacks can fire
            // before connectWebSocket() has assigned it.
            if (audioBuffer.length >= 1600 && socket && socket.readyState === WebSocket.OPEN) {
                const chunk = new Int16Array(audioBuffer.slice(0, 1600));
                socket.send(chunk.buffer);
                audioBuffer = audioBuffer.slice(1600);
            }
            // Mute the processing chain's output.
            const output = e.outputBuffer.getChannelData(0);
            for (let i = 0; i < output.length; i++) {
                output[i] = 0;
            }
        };
        source.connect(processor);
        processor.connect(audioContext.destination);
        // 4. Keep Recorder.js running for compatibility.
        recorder = new Recorder(source, {
            numChannels: 1,
            sampleRate: audioContext.sampleRate
        });
        recorder.record();
        // 5. Connect to the recognition server.
        await connectWebSocket();
        // 6. Flip the UI into the "recording" state.
        toggleButton.innerHTML = '<i class="fa-solid fa-microphone-slash"></i> 停止录音';
        toggleButton.classList.remove('bg-blue-500', 'hover:bg-blue-600');
        toggleButton.classList.add('bg-red-500', 'hover:bg-red-600');
        isRecording = true;
    } catch (error) {
        console.error("录音启动失败:", error);
        updateStatus(`错误: ${error.message}`, true);
        stopRecording();
    }
}
// Tear everything down: recorder, mic tracks, audio graph, socket, and UI.
function stopRecording() {
    if (recorder) {
        recorder.stop();
        recorder.clear();
    }
    if (audioStream) {
        audioStream.getTracks().forEach((track) => track.stop());
    }
    if (audioContext) {
        audioContext.close();
    }
    if (socket && socket.readyState === WebSocket.OPEN) {
        socket.close();
    }
    // Restore the idle button state and drop any buffered audio.
    toggleButton.innerHTML = '<i class="fa-solid fa-microphone"></i> 开始录音';
    toggleButton.classList.remove('bg-red-500', 'hover:bg-red-600');
    toggleButton.classList.add('bg-blue-500', 'hover:bg-blue-600');
    isRecording = false;
    audioBuffer = [];
}
// Main button toggles recording on/off.
toggleButton.addEventListener('click', () => {
    if (isRecording) {
        stopRecording();
    } else {
        startRecording();
    }
});
// Clear button wipes the transcript.
// Fix: the old handler also assigned `lastFinalText = ''`, an undeclared
// global that nothing reads. The duplicate-suppression variable
// (`lastFinalResult`) is closure-local to connectWebSocket and cannot be
// reset from here; the stray assignment is removed.
clearButton.addEventListener('click', () => {
    editableText.textContent = '';
});
</script>
</body>
</html>


