vosk语音转文字

Vosk 是一款开源、离线的语音转文字工具,支持多语言和实时转写,核心优势是无需网络、低延迟且部署灵活。

核心特点

完全离线运行,不依赖云端服务,保护数据隐私。支持 20+ 语言(含中文),提供预训练模型,可直接使用。轻量高效,适配桌面(Windows/Mac/Linux)、移动端(Android/iOS)和嵌入式设备。支持实时流式转写和批量音频文件处理,准确率表现稳定。

适用场景

离线环境下的语音笔记、会议记录转写。嵌入式设备(如智能音箱、机器人)的语音交互。需保护数据隐私的企业级语音转写需求。开发自定义语音转文字应用(提供 Python/Java/C++ 等多语言 API)。

基础使用步骤

下载对应语言的预训练模型(中文模型约 1GB,轻量版更小)。安装 Vosk 核心库(Python 可通过 pip install vosk 快速安装)。导入库并加载模型,读取音频文件或实时捕获麦克风输入。调用转写接口,获取文字结果(支持逐句输出或完整文本)。


conda activate vost

conda install pip

pip install --upgrade pip

安装vosk
pip install vosk

安装sounddevice
pip install sounddevice


模型下载:https://alphacephei.com/vosk/models

访问页面:在浏览器中输入http://localhost:8000/index.html

chrome://flags/#unsafely-treat-insecure-origin-as-secure

http://192.168.3.122:8000/socket.html

dependencies

python 3.8.10、vosk、vosk-model-cn-0.22、sounddevice、websockets

模型准备

pretrained_model,已存放在model文件夹下

测试

cd 进入包含页面HTML文件的目录
cd .\templates
使用 Python 的 http.server 模块
python -m http.server 8000
浏览器访问麦克风需要 HTTPS 安全上下文(localhost 除外)。
开发环境下可在 Chrome 浏览器中打开 chrome://flags/#unsafely-treat-insecure-origin-as-secure,将 http://192.168.3.122:8000 加入安全来源,然后重启浏览器。
之后在浏览器中输入本地地址 http://localhost:8000/socket.html
或者 http://192.168.3.122:8000/socket.html

VoiceRecognitionServer.py


import asyncio
import json
import os
import uuid
import websockets
from vosk import Model, KaldiRecognizer, SetLogLevel
from typing import Set, Dict

# Configure the log level
SetLogLevel(-1)  # Disable Vosk logging


class VoiceRecognitionServer:
    """WebSocket server that transcribes streamed PCM audio with Vosk.

    Every client connection gets its own ``KaldiRecognizer`` built on one
    shared model. Binary frames are fed to that recognizer and the final or
    partial transcript is sent back to the client as a JSON text frame.
    """

    def __init__(self, host: str = "0.0.0.0", port: int = 5678,
                 sample_rate: int = 16000, max_connections: int = 100,
                 model_path: str = "../model"):
        """Store the server settings and load the recognition model.

        Args:
            host: Interface to listen on.
            port: TCP port to listen on.
            sample_rate: Sample rate (Hz) given to every recognizer and
                reported to clients in the ``connected`` message.
            max_connections: Once this many connections are active, new
                ones are rejected with a ``server_busy`` error.
            model_path: Directory that contains the unpacked Vosk model.
        """
        self.host = host
        self.port = port
        self.sample_rate = sample_rate
        self.max_connections = max_connections
        self.active_connections: Set[websockets.WebSocketServerProtocol] = set()
        self.client_recognizers: Dict[str, KaldiRecognizer] = {}
        self.model = self.load_model(model_path)

    def load_model(self, model_path: str = "../model"):
        """Load and return the Vosk model stored at ``model_path``.

        Raises:
            SystemExit: With exit code 1 when ``model_path`` does not exist.
        """
        if not os.path.exists(model_path):
            print(f"[ERROR] 请从 https://alphacephei.com/vosk/models 下载模型并解压到 {model_path} 文件夹")
            # Raise SystemExit directly: the ``exit`` builtin is injected by
            # the ``site`` module and is not guaranteed to exist.
            raise SystemExit(1)

        print("[INFO] 正在加载语音识别模型...")
        model = Model(model_path)
        print("[INFO] 模型加载完成")
        return model

    async def handle_connection(self, websocket, path=None):
        """Serve one client connection until it closes.

        Args:
            websocket: The connection object supplied by ``websockets``.
            path: Request path passed as a second argument by older
                ``websockets`` releases; unused. It is optional because
                newer releases call the handler with the connection only.
        """
        client_id = str(uuid.uuid4())
        print(f"[CONNECT] 新客户端连接: {client_id}")

        # Reject before registering anything, so there is nothing to clean up.
        if len(self.active_connections) >= self.max_connections:
            await websocket.send(json.dumps({
                "error": "server_busy",
                "message": "服务器达到最大连接数"
            }))
            await websocket.close()
            return

        try:
            self.active_connections.add(websocket)
            recognizer = KaldiRecognizer(self.model, self.sample_rate)
            recognizer.SetWords(True)
            self.client_recognizers[client_id] = recognizer

            # Tell the client its id and the sample rate the recognizer uses.
            await websocket.send(json.dumps({
                "status": "connected",
                "client_id": client_id,
                "sample_rate": self.sample_rate
            }))

            async for message in websocket:
                await self.process_message(websocket, client_id, message)

        except websockets.ConnectionClosed:
            print(f"[DISCONNECT] 客户端断开: {client_id}")
        except Exception as e:
            print(f"[ERROR] 客户端 {client_id} 处理异常: {e}")
        finally:
            self.cleanup_client(client_id, websocket)

    async def process_message(self, websocket, client_id, message):
        """Handle one frame received from a client.

        Text frames: ``"ping"`` is answered with ``"pong"``; any other text
        is ignored. Binary frames are fed to the client's recognizer and the
        final (``text``) or partial (``partial``) result is sent back as JSON.

        Raises:
            websockets.ConnectionClosed: Propagated to the caller when the
                client goes away while a reply is being sent.
        """
        try:
            if isinstance(message, str):
                if message == "ping":
                    await websocket.send("pong")
                return

            recognizer = self.client_recognizers.get(client_id)
            if not recognizer:
                return

            # 16-bit PCM means two bytes per sample, so an odd byte count
            # cannot be a whole number of samples.
            if len(message) % 2 != 0:
                print(f"[WARN] 客户端 {client_id} 发送了异常长度的音频数据")
                return

            # Decoding is CPU-bound; run it in the default executor so the
            # event loop keeps serving other clients and keepalive pings.
            loop = asyncio.get_running_loop()
            utterance_done = await loop.run_in_executor(
                None, recognizer.AcceptWaveform, message
            )

            if utterance_done:
                result = json.loads(recognizer.Result())
                await websocket.send(json.dumps({
                    "text": result.get("text", ""),
                    "final": True,
                    "client_id": client_id
                }))
            else:
                partial = json.loads(recognizer.PartialResult())
                await websocket.send(json.dumps({
                    "partial": partial.get("partial", ""),
                    "final": False,
                    "client_id": client_id
                }))

        except websockets.ConnectionClosed:
            # A disconnect is not a processing error, and replying on a
            # closed connection would only raise again; let the caller
            # handle it.
            raise
        except Exception as e:
            print(f"[ERROR] 处理客户端 {client_id} 消息失败: {e}")
            await websocket.send(json.dumps({
                "error": "processing_error",
                "message": str(e)
            }))

    def cleanup_client(self, client_id, websocket):
        """Forget the connection and the recognizer that belong to a client."""
        self.active_connections.discard(websocket)
        self.client_recognizers.pop(client_id, None)
        print(f"[CLEANUP] 已清理客户端资源: {client_id}")

    async def run_server(self):
        """Start listening on ``host:port`` and serve until cancelled."""
        print(f"[SERVER] 启动语音识别服务器 {self.host}:{self.port}")
        async with websockets.serve(
                self.handle_connection,
                self.host,
                self.port,
                ping_interval=20,
                ping_timeout=60,
                max_size=2 ** 20  # 1 MB cap per incoming message
        ):
            print("[SERVER] 服务器已就绪,等待连接...")
            await asyncio.Future()  # Never completes: run forever


if __name__ == "__main__":
    # Build the server (which loads the model) and serve until interrupted.
    try:
        asyncio.run(VoiceRecognitionServer().run_server())
    except KeyboardInterrupt:
        # Ctrl+C is reported as a normal shutdown.
        print("[SERVER] 服务器正常关闭")
    except Exception as exc:
        print(f"[CRITICAL] 服务器崩溃: {exc}")

socket.html


<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>语音实时转文字</title>
    <link rel="shortcut icon" href="./ai.ico" type="image/x-icon">
    <!-- CDN originals of the three local assets below, kept for reference.
         Each one is closed on its own line so that an unterminated comment
         cannot swallow the markup that follows it. -->
    <!-- <script src="https://cdn.tailwindcss.com"></script> -->
    <!-- <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.7.2/css/all.min.css"> -->
    <!-- <script src="https://cdn.jsdelivr.net/gh/mattdiamond/Recorderjs@master/dist/recorder.js"></script> -->

    <!-- Local copies of the same assets. -->
    <script src="./js/tailwind.min.js"></script>
    <link href="./css/all.min.css" rel="stylesheet">
    <script src="./js/recorder.js"></script>
    <style>
        /* Font Awesome faces loaded from the local ./fonts directory. */
        @font-face {
            font-family: 'Font Awesome 6 Free';
            font-style: normal;
            font-weight: 900;
            font-display: block;
            src: url("./fonts/fa-solid-900.woff2") format("woff2");
        }
        @font-face {
            font-family: 'Font Awesome 6 Free';
            font-style: normal;
            font-weight: 400;
            font-display: block;
            src: url("./fonts/fa-regular-400.woff2") format("woff2");
        }
        /* A contenteditable <div> does not render its placeholder attribute
           by itself; show it through generated content while it is empty. */
        #editableText:empty::before {
            content: attr(placeholder);
            color: #9ca3af;
            pointer-events: none;
        }
    </style>
</head>
<body class="bg-gray-100 flex justify-center items-center h-screen">
    <!-- Card holding the whole UI: server address, record toggle, transcript. -->
    <div class="bg-white p-8 rounded shadow-md w-full max-w-md">
        <h1 class="text-2xl font-bold mb-4">语音实时转文字</h1>
        <!-- WebSocket URL of the recognition server; the script reads it when connecting. -->
        <div class="mb-4">
            <label class="block text-sm font-medium mb-1">服务器地址</label>
            <input id="serverUrl" type="text" value="ws://192.168.3.122:5678"
                   class="w-full p-2 border rounded">
        </div>
        <!-- Start/stop toggle; the script swaps its label and colors. -->
        <button id="toggleButton" class="bg-blue-500 text-white py-2 px-4 rounded hover:bg-blue-600 focus:outline-none">
            <i class="fa-solid fa-microphone"></i> 开始录音
        </button>
        <!-- Transcript area: #result is the scroll container, #editableText holds the user-editable text. -->
        <!-- NOTE(review): placeholder has no built-in effect on a div; it is only visible if a stylesheet renders attr(placeholder). -->
        <div class="mt-4 p-4 bg-gray-100 rounded">
                <div id="result" class="min-h-40 max-h-60 overflow-y-auto mb-2 p-2 bg-white rounded">
                <div
                    id="editableText"
                    class="w-full h-full outline-none overflow-y-auto"
                    contenteditable="true"
                    spellcheck="false"
                    placeholder="准备就绪,点击上方按钮开始录音..."
                ></div>
            </div>
            <!-- Footer row: clear button on the left, connection status on the right. -->
            <div class="flex justify-between">
                <button id="clearButton" class="text-sm text-gray-500 hover:text-gray-700">
                    <i class="fa-solid fa-trash-can"></i> 清空结果
                </button>
                <span id="status" class="text-sm text-gray-500">未连接</span>
            </div>
        </div>
    </div>

    <script>
        // DOM handles: controls first, then the output areas.
        const toggleButton = document.getElementById('toggleButton');
        const clearButton = document.getElementById('clearButton');
        const serverUrlInput = document.getElementById('serverUrl');
        const statusSpan = document.getElementById('status');
        const resultDiv = document.getElementById('result');
        const editableText = document.getElementById('editableText');

        // Session resources; assigned when a recording starts.
        let recorder, audioContext, socket, audioStream;

        // Recording state.
        let isRecording = false;
        let audioBuffer = [];    // converted samples waiting to be sent
        let resampleRatio = 3;   // 48 kHz -> 16 kHz

        // Show `message` in the status line: red when `isError`, green otherwise.
        function updateStatus(message, isError = false) {
            const colorClass = isError ? 'text-red-500' : 'text-green-500';
            statusSpan.textContent = message;
            statusSpan.className = 'text-sm ' + colorClass;
        }


        // Handle a recognized sentence. A final result first removes every
        // `.temp-result` node, then either runs a voice command ("发送" calls
        // send(), "清空"/"清空结果" clicks the clear button) or is appended
        // to the editable text. The result box is always scrolled to the end.
        function appendResult(text, isFinal) {
            if (isFinal) {
                document.querySelectorAll('.temp-result').forEach((node) => node.remove());

                switch (text.trim()) {
                    case '发送':
                        send();
                        break;
                    case '清空':
                    case '清空结果':
                        clearButton.click();
                        break;
                    default:
                        editableText.textContent += text;
                }
            }
            resultDiv.scrollTop = resultDiv.scrollHeight;
        }

        // Show the in-progress (partial) transcript on a grey line inside the
        // result box, after the editable text. The line carries the
        // `temp-result` class and lives outside #editableText, so partial
        // text never becomes part of the text that the user edits or sends.
        //
        // The previous body never displayed `text`; its only write appended
        // the element object itself to the transcript.
        function updatePartialResult(text) {
            let tempLine = resultDiv.querySelector('.temp-result');
            if (!tempLine) {
                tempLine = document.createElement('p');
                tempLine.className = 'mb-1 text-gray-500 temp-result';
                resultDiv.appendChild(tempLine);
            }
            tempLine.textContent = text;
            resultDiv.scrollTop = resultDiv.scrollHeight;
        }

          // Simulated "send": alerts the current transcript, or a notice when it is empty.
        function send() {
            const content = editableText.textContent;
            if (!content) {
                alert("没有需要发送的内容@_@!");
                return;
            }
            alert("模拟发送:" + content);
        }

        // Open the WebSocket to the URL in the server-address field and wire
        // up its handlers. The returned promise resolves once the socket is
        // open and rejects with an Error when the connection fails.
        async function connectWebSocket() {
            return new Promise((resolve, reject) => {
                updateStatus("连接中...");
                const url = serverUrlInput.value;
                socket = new WebSocket(url);
                socket.binaryType = 'arraybuffer';

                let lastFinalResult = ""; // last final text that was accepted
                let serverError = "";     // last error reported by the server

                socket.onopen = () => {
                    updateStatus("已连接");
                    resolve();
                };

                socket.onerror = () => {
                    updateStatus("连接错误", true);
                    // The error event carries no message; reject with a real
                    // Error so callers can display `error.message`.
                    reject(new Error("无法连接到 " + url));
                };

                socket.onclose = () => {
                    if (isRecording) {
                        // Keep the server's reason (e.g. server_busy) visible
                        // instead of overwriting it with a generic text.
                        updateStatus(serverError ? "连接意外断开: " + serverError : "连接意外断开", true);
                        stopRecording();
                    } else {
                        updateStatus("已断开");
                    }
                };

                socket.onmessage = (event) => {
                    try {
                        const data = JSON.parse(event.data);

                        // The server reports failures as {error, message}.
                        if (data.error) {
                            serverError = data.message || data.error;
                            updateStatus(serverError, true);
                            return;
                        }

                        // Once the transcript has been cleared, the same
                        // sentence must be accepted again.
                        if (editableText.textContent === '') {
                            lastFinalResult = "";
                        }

                        // Only handle a final result that differs from the previous one.
                        if (data.final && data.text && data.text !== lastFinalResult) {
                            appendResult(data.text, true);
                            lastFinalResult = data.text;
                        } else if (!data.final && data.partial) { // partial result
                            updatePartialResult(data.partial);
                        }
                    } catch (e) {
                        console.error("消息解析错误:", e);
                    }
                };
            });
        }

        // Start a recording session: capture the microphone, convert it to
        // 16 kHz 16-bit PCM, connect the WebSocket and stream 100 ms chunks.
        // Any failure is shown in the status line and the session is torn down.
        async function startRecording() {
            const TARGET_SAMPLE_RATE = 16000; // rate the audio is converted to
            const CHUNK_SAMPLES = 1600;       // 100 ms at 16 kHz

            try {
                // Browsers only expose the microphone API on HTTPS or localhost.
                if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
                    throw new Error("浏览器未开放麦克风接口(需要 HTTPS 或 localhost)");
                }

                // 1. Ask for microphone access.
                audioStream = await navigator.mediaDevices.getUserMedia({
                    audio: {
                        channelCount: 1,
                        echoCancellation: false,
                        noiseSuppression: false,
                        autoGainControl: false
                    },
                    video: false
                });

                // 2. Create the AudioContext.
                audioContext = new (window.AudioContext || window.webkitAudioContext)();
                const source = audioContext.createMediaStreamSource(audioStream);

                // Derive the step from the device's real rate; a fixed 3 is
                // only right for 48 kHz and would mis-time 44.1 kHz input.
                resampleRatio = audioContext.sampleRate / TARGET_SAMPLE_RATE;

                // 3. Create the resampling processor.
                const processor = audioContext.createScriptProcessor(4096, 1, 1);
                let readPos = 0; // fractional read position carried across callbacks

                processor.onaudioprocess = (e) => {
                    const input = e.inputBuffer.getChannelData(0);

                    // Simple decimation: take every resampleRatio-th sample,
                    // scale it to 16-bit and clamp it.
                    let pos = readPos;
                    for (; pos < input.length; pos += resampleRatio) {
                        const val = input[Math.floor(pos)] * 32767;
                        audioBuffer.push(val > 32767 ? 32767 : val < -32768 ? -32768 : val);
                    }
                    readPos = pos - input.length;

                    // Send every complete chunk. `socket` is still undefined
                    // while the first connection is being set up, and audio
                    // buffered during that time must be drained afterwards.
                    while (audioBuffer.length >= CHUNK_SAMPLES && socket && socket.readyState === WebSocket.OPEN) {
                        const chunk = new Int16Array(audioBuffer.splice(0, CHUNK_SAMPLES));
                        socket.send(chunk.buffer);
                    }

                    // Silence the output so the microphone is not played back.
                    e.outputBuffer.getChannelData(0).fill(0);
                };

                source.connect(processor);
                processor.connect(audioContext.destination);

                // 4. Initialize Recorder (kept for compatibility).
                // NOTE(review): nothing in this script reads the recorder's
                // data; it looks removable - confirm before dropping it.
                recorder = new Recorder(source, {
                    numChannels: 1,
                    sampleRate: audioContext.sampleRate
                });
                recorder.record();

                // 5. Connect the WebSocket.
                await connectWebSocket();

                // 6. Update the UI.
                toggleButton.innerHTML = '<i class="fa-solid fa-microphone-slash"></i> 停止录音';
                toggleButton.classList.remove('bg-blue-500', 'hover:bg-blue-600');
                toggleButton.classList.add('bg-red-500', 'hover:bg-red-600');
                isRecording = true;

            } catch (error) {
                console.error("录音启动失败:", error);
                updateStatus(`错误: ${error.message}`, true);
                stopRecording();
            }
        }

        // Stop the recording session and release everything it holds. Safe
        // to call repeatedly and after a partially failed start: each
        // resource is released once and its reference is dropped, so a later
        // call does not touch an already closed AudioContext or recorder.
        function stopRecording() {
            if (recorder) {
                recorder.stop();
                recorder.clear();
                recorder = null;
            }

            if (audioStream) {
                audioStream.getTracks().forEach(track => track.stop());
                audioStream = null;
            }

            if (audioContext) {
                // close() rejects on a context that is already closed.
                if (audioContext.state !== 'closed') {
                    audioContext.close();
                }
                audioContext = null;
            }

            // Mark the session as stopped before closing the socket, so its
            // close handler reports a normal disconnect.
            isRecording = false;
            audioBuffer = [];

            if (socket && socket.readyState === WebSocket.OPEN) {
                socket.close();
            }

            toggleButton.innerHTML = '<i class="fa-solid fa-microphone"></i> 开始录音';
            toggleButton.classList.remove('bg-red-500', 'hover:bg-red-600');
            toggleButton.classList.add('bg-blue-500', 'hover:bg-blue-600');
        }

        // Toggle recording when the main button is clicked.
        toggleButton.addEventListener('click', async () => {
            if (isRecording) {
                stopRecording();
                return;
            }

            // startRecording() is asynchronous and isRecording only becomes
            // true at its end; disable the button meanwhile so a second
            // click cannot start a parallel capture.
            toggleButton.disabled = true;
            try {
                await startRecording();
            } finally {
                toggleButton.disabled = false;
            }
        });


         // Clear the transcript, including any pending partial-result line.
         // The former `lastFinalText = ''` assigned to an undeclared
         // identifier: it only created an unused implicit global (and would
         // throw in strict mode), so it has been removed.
         clearButton.addEventListener('click', () => {
            editableText.textContent = '';
            document.querySelectorAll('.temp-result').forEach(el => el.remove());
        });
    </script>
</body>
</html>
© 版权声明

相关文章

暂无评论

您必须登录才能参与评论!
立即登录
none
暂无评论...