From 0fd8170c5bbb50beac026587f75f89357e801d7f Mon Sep 17 00:00:00 2001 From: bladeclara42 <71927457+bladeclara42@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:02:12 +0700 Subject: [PATCH] feat: add voice transcription --- app/api/v1/voice.py | 10 +++++-- ...y => openai_voice_transcription_client.py} | 30 ++++++++++++++++++- app/main.py | 2 +- app/models/voice.py | 13 ++++++-- app/services/voice.py | 5 +++- 5 files changed, 53 insertions(+), 7 deletions(-) rename app/core/{openai_voice_client.py => openai_voice_transcription_client.py} (55%) diff --git a/app/api/v1/voice.py b/app/api/v1/voice.py index 67bdede..cf94768 100644 --- a/app/api/v1/voice.py +++ b/app/api/v1/voice.py @@ -1,6 +1,6 @@ from fastapi import APIRouter -from app.models.voice import VoiceRequest, VoiceResponse -from app.services.voice import generate_voice +from app.models.voice import VoiceRequest, VoiceResponse, TranscriptionRequest, TranscriptionResponse +from app.services.voice import generate_voice, generate_transcription router = APIRouter() @@ -8,3 +8,9 @@ router = APIRouter() async def voice(request: VoiceRequest): voice = await generate_voice(request.text) return VoiceResponse(voice=voice) + +@router.post("/transcription", response_model=TranscriptionResponse) +async def transcription(request: TranscriptionRequest): + transcription = await generate_transcription(request.audio_file_path) + return TranscriptionResponse(transcription=transcription) + diff --git a/app/core/openai_voice_client.py b/app/core/openai_voice_transcription_client.py similarity index 55% rename from app/core/openai_voice_client.py rename to app/core/openai_voice_transcription_client.py index 820f9fa..d50cc27 100644 --- a/app/core/openai_voice_client.py +++ b/app/core/openai_voice_transcription_client.py @@ -20,8 +20,36 @@ async def generate_voice(messages: list): try: response = client.chat.completions.create( model=OPENAI_AUDIO_MODEL, - response_format="mp3", messages=messages, + max_tokens=1000, + 
temperature=0.7, + stream=False + ) + + if not response.choices or not response.choices[0].message.content: + return "No response content from the model" + + return response.choices[0].message.content + + except OpenAIError as e: + error_msg = f"OpenAI API Error: {str(e)}" + print(error_msg) + raise Exception(error_msg) from e + except Exception as e: + error_msg = f"Unexpected error: {str(e)}" + print(error_msg) + raise Exception(error_msg) from e + +async def generate_transcription(audio_file_path: str) -> str: + if not audio_file_path: + raise ValueError("Audio file path cannot be empty") + + try: + response = client.audio.transcriptions.create( + model=OPENAI_AUDIO_MODEL, + file=audio_file_path, + response_format="text", + language="id" ) if not response.choices or not response.choices[0].message.content: diff --git a/app/main.py b/app/main.py index 9f77a86..7345442 100644 --- a/app/main.py +++ b/app/main.py @@ -6,4 +6,4 @@ app = FastAPI() # Include your routes app.include_router(translate.router, prefix="/api/v1/translate", tags=["translate"]) -app.include_router(voice.router, prefix="/api/v1/voice", tags=["voice"]) \ No newline at end of file +app.include_router(voice.router, prefix="/api/v1/voice", tags=["voice"]) diff --git a/app/models/voice.py b/app/models/voice.py index 0efb19d..5daadcc 100644 --- a/app/models/voice.py +++ b/app/models/voice.py @@ -1,8 +1,17 @@ from pydantic import BaseModel +from typing import Optional - +# Text-to-Speech Models class VoiceRequest(BaseModel): text: str class VoiceResponse(BaseModel): - voice: str + voice_output: str + +# Speech-to-Text Models +class TranscriptionRequest(BaseModel): + audio_file_path: str + target_language: Optional[str] = "id" # Default to Indonesian + +class TranscriptionResponse(BaseModel): + text: str diff --git a/app/services/voice.py b/app/services/voice.py index 87a64db..772e0a0 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1,7 +1,10 @@ -from app.core.openai_voice_client import
generate_voice +from app.core.openai_voice_transcription_client import generate_voice, generate_transcription async def generate_voice(text: str) -> str: voice = await generate_voice(text) return voice +async def generate_transcription(audio_file_path: str) -> str: + transcription = await generate_transcription(audio_file_path) + return transcription