# Title: qwen3-tts
# Content:

import torch
# Startup diagnostics: report whether CUDA is usable and which CUDA version
# this torch build was compiled against (None for CPU-only builds).
print(torch.cuda.is_available())
print(torch.version.cuda)
# get_device_name() raises when no CUDA device is present. The model in this
# script is loaded with device_map="cpu", so a GPU-less host must still be
# able to start; only query the device name when CUDA is actually available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
import soundfile as sf
from qwen_tts import Qwen3TTSModel
from flask import Flask, request, jsonify, send_file
import io

# Flask application object; the HTTP route in this file is registered on it.
app = Flask(__name__)

# Load the TTS checkpoint once, at import time, so every request reuses the
# same model instance. This configuration runs on the CPU in float32.
# The GPU variant previously tried here was device_map="cuda:0" with
# dtype=torch.float16, optionally adding
# attn_implementation="flash_attention_2".
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    dtype=torch.float32,
    device_map="cpu",
)

# single inference
#wavs, sr = model.generate_custom_voice(
#    #text="你各位小心嘍,現在覺得世間都充滿欠揍的屁孩,小山一醒來就揍人",
#    language="Chinese", # Pass `Auto` (or omit) for auto language adaptive; if the target language is known, set it explicitly.
#    speaker="Vivian",
#    instruct="有點口急,且吞了一下口水的說", # Omit if not needed.
#)
#sf.write("output_custom_voice.wav", wavs[0], sr)

# -------------------------------
# Flask API: single-sentence generation
# -------------------------------
@app.route("/generate_voice", methods=["POST"])
def generate_voice():
    """Synthesize one utterance and return it as a WAV attachment.

    Reads these fields from the POSTed form data:

    * ``text`` -- the text to speak (required; must contain at least one
      non-whitespace character).
    * ``speaker`` -- voice name, defaults to ``"Vivian"``.
    * ``language`` -- defaults to ``"Chinese"``.
    * ``instruct`` -- speaking-style instruction, defaults to a calm,
      good-mood tone.

    Returns:
        On success, a ``send_file`` response carrying the first generated
        waveform encoded as WAV, offered as the attachment ``output.wav``.
        A JSON ``{"error": ...}`` body with status 400 when ``text`` is
        missing or blank, and with status 500 when generation or WAV
        encoding fails.
    """
    form = request.form
    text = form.get("text", "")
    speaker = form.get("speaker", "Vivian")
    language = form.get("language", "Chinese")
    instruct = form.get("instruct", "用平靜心情很好的口氣說")

    # Whitespace-only input carries nothing to speak; reject it the same way
    # as a missing field instead of handing it to the model.
    if not text.strip():
        return jsonify({"error": "No text provided"}), 400

    try:
        # Generate speech.
        wavs, sr = model.generate_custom_voice(
            text=text,
            speaker=speaker,
            language=language,
            instruct=instruct,
        )

        # Encode the first returned waveform as WAV bytes in memory.
        buffer = io.BytesIO()
        sf.write(buffer, wavs[0], sr, format="WAV")
    except Exception:
        # Request boundary: the exception types raised by the model and the
        # audio encoder are not visible here, so catch broadly, log the
        # traceback, and answer with JSON like the 400 path does.
        app.logger.exception("Voice generation failed")
        return jsonify({"error": "Voice generation failed"}), 500

    # Rewind so send_file streams the buffer from its first byte.
    buffer.seek(0)

    return send_file(
        buffer,
        mimetype="audio/wav",
        as_attachment=True,
        download_name="output.wav",
    )

# -------------------------------
# Start Flask
# -------------------------------
if __name__ == "__main__":
    # Only start the server when executed as a script. Binding to 0.0.0.0
    # makes the API reachable on every network interface, on port 5000.
    listen_options = {"host": "0.0.0.0", "port": 5000}
    app.run(**listen_options)