Extracting text from multimedia speech material with faster-whisper - 3
"""
Batch-transcribe the .mp3 files in the current directory using faster-whisper.
"""
import logging
import sys
from pathlib import Path
from faster_whisper import WhisperModel
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('Extract Text')
# ================== Configuration ==================
MODEL_SIZE = "small"    # options: tiny, base, small, medium, large
DEVICE = "cpu"          # "cpu" or "cuda"
COMPUTE_TYPE = "int8"   # int8, float16, float32 (int8 recommended on CPU)
VAD_FILTER = True       # enable voice activity detection to drop silence
OUTPUT_FORMAT = "txt"   # only .txt output is produced
VERBOSE = True          # show per-segment log output
# ====================================================
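# Note (added, not part of the original configuration): on an NVIDIA GPU the
# faster-whisper documentation shows DEVICE = "cuda" with COMPUTE_TYPE = "float16";
# the int8 setting above targets CPU-only machines.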
def transcribe_audio(audio_path: Path, model: WhisperModel) -> str:
    """Transcribe a single audio file and return its text content."""
    print(f"Transcribing: {audio_path.name} → {audio_path.stem}.txt")
    segments, info = model.transcribe(
        str(audio_path),
        language=None,  # auto-detect the language
        beam_size=5,
        vad_filter=VAD_FILTER,
        vad_parameters=dict(min_silence_duration_ms=500),
        word_timestamps=False,
    )
    # `segments` is a lazy generator; transcription happens as it is iterated.
    text_lines = []
    for segment in segments:
        line = segment.text.strip()
        text_lines.append(line)
        if VERBOSE:
            logger.info(line)
            # print(f"[{segment.start:06.2f}s --> {segment.end:06.2f}s] {line}", flush=True)
    return "\n".join(text_lines)
def main():
    print("=== faster-whisper batch transcription ===")
    current_dir = Path(".")
    mp3_files = sorted(current_dir.glob("*.mp3"))
    if not mp3_files:
        print("No .mp3 files found, exiting.")
        return
    # Load the model once and reuse it for every file.
    print(f"Loading model {MODEL_SIZE} ({DEVICE}, {COMPUTE_TYPE})...")
    model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
    processed = 0
    for mp3_path in mp3_files:
        txt_path = mp3_path.with_suffix(".txt")
        if txt_path.exists():
            print(f"Skipping: {txt_path.name} already exists")
            continue
        try:
            text = transcribe_audio(mp3_path, model)
            txt_path.write_text(text, encoding="utf-8")
            processed += 1
        except Exception as e:
            print(f"Error transcribing {mp3_path.name}: {e}", file=sys.stderr)
    print(f"All done! Processed {processed} file(s).")


if __name__ == "__main__":
    main()
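For a quick test outside the batch script, a minimal single-file sketch looks like the following. It assumes faster-whisper has been installed (e.g. pip install faster-whisper), and "example.mp3" is a placeholder file name.

from faster_whisper import WhisperModel

model = WhisperModel("small", device="cpu", compute_type="int8")
# "example.mp3" is a placeholder; transcription runs as the generator is consumed.
segments, info = model.transcribe("example.mp3", beam_size=5)
print(f"Detected language: {info.language} (probability {info.language_probability:.2f})")
print("\n".join(segment.text.strip() for segment in segments))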