# 關閉 (Close)
# 標題 (Title): qwen3-tts
# 內容 (Content):
import torch
# Startup diagnostics: report whether this PyTorch build can use CUDA,
# which CUDA version it was built against, and the name of device 0.
_cuda_available = torch.cuda.is_available()
print(_cuda_available)
print(torch.version.cuda)
# Only query the device name when CUDA is usable: get_device_name(0) raises
# on a machine without a CUDA device, and this script is configured to run
# the model on CPU, so a missing GPU must not abort startup.
print(torch.cuda.get_device_name(0) if _cuda_available else None)
import soundfile as sf
from qwen_tts import Qwen3TTSModel
from flask import Flask, request, jsonify, send_file
import io
# Flask application object; the route defined below is registered on it.
app = Flask(__name__)

# Keyword arguments for loading the model: run on CPU in float32.
# Alternatives that were left disabled: device_map="cuda:0",
# dtype=torch.float16, attn_implementation="flash_attention_2".
_MODEL_OPTIONS = {
    "device_map": "cpu",
    "dtype": torch.float32,
}

# Load the CustomVoice checkpoint once at module import; the request handler
# reuses this single instance.
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    **_MODEL_OPTIONS,
)
# single inference
#wavs, sr = model.generate_custom_voice(
# #text="你各位小心嘍,現在覺得世間都充滿欠揍的屁孩,小山一醒來就揍人",
# language="Chinese", # Pass `Auto` (or omit) for auto language adaptive; if the target language is known, set it explicitly.
# speaker="Vivian",
# instruct="有點口急,且吞了一下口水的說", # Omit if not needed.
#)
#sf.write("output_custom_voice.wav", wavs[0], sr)
# -------------------------------
# Flask API: single-utterance generation
# -------------------------------
@app.route("/generate_voice", methods=["POST"])
def generate_voice():
    """Synthesize one utterance and return it as a WAV attachment.

    Reads these POST form fields:
        text: The text to synthesize (required).
        speaker: Speaker name passed to the model; defaults to "Vivian".
        language: Language passed to the model; defaults to "Chinese".
        instruct: Speaking-style instruction passed to the model; the
            default is a Chinese sentence asking for a calm, cheerful tone.

    Returns:
        On success, the generated audio as "output.wav" (audio/wav).
        A JSON ``{"error": ...}`` body with status 400 when ``text`` is
        missing or blank, or with status 500 when generation or WAV
        encoding fails.
    """
    form = request.form
    text = form.get("text", "")
    speaker = form.get("speaker", "Vivian")
    language = form.get("language", "Chinese")
    instruct = form.get("instruct", "用平靜心情很好的口氣說")

    # Treat whitespace-only input like missing input: there is nothing to
    # synthesize, so fail fast instead of invoking the model.
    if not text.strip():
        return jsonify({"error": "No text provided"}), 400

    try:
        # Generate the speech; the model returns the waveforms and their
        # sample rate.
        wavs, sr = model.generate_custom_voice(
            text=text,
            speaker=speaker,
            language=language,
            instruct=instruct,
        )
        # Encode the first waveform as WAV bytes in memory.
        buffer = io.BytesIO()
        sf.write(buffer, wavs[0], sr, format="WAV")
    except Exception:
        # Request boundary: log the traceback and answer with a JSON error,
        # consistent with the 400 path, rather than letting the exception
        # propagate out of the handler.
        app.logger.exception("Voice generation failed")
        return jsonify({"error": "Voice generation failed"}), 500

    # Rewind so send_file streams the buffer from its first byte.
    buffer.seek(0)
    return send_file(
        buffer,
        mimetype="audio/wav",
        as_attachment=True,
        download_name="output.wav",
    )
# -------------------------------
# Start the Flask server
# -------------------------------
if __name__ == "__main__":
    # Runs only when the file is executed directly: serve on port 5000,
    # bound to 0.0.0.0 (all network interfaces).
    app.run(port=5000, host="0.0.0.0")