The Python client for BigQuery recently added a to_arrow() method that returns query results as an [Apache Arrow] Table. This means there is no need to serialize data or convert types on either the server or the client, which can yield large gains in both performance and memory efficiency.
import pyarrow as pa
from google.cloud import bigquery


def arrow_table_to_pybytes(arrow_table):
    """Serialize an Arrow Table to bytes using the Arrow IPC stream format."""
    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(sink, arrow_table.schema)
    for batch in arrow_table.to_batches():
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()
    return buf.to_pybytes()


def get_bq_arrow(request):
    dataset = "hpwg-297320.movebank"
    table = "wildebeest"
    query = f"""
        SELECT
            individual_local_identifier, timestamp, location_long, location_lat
        FROM {dataset}.{table}
        LIMIT 10000
    """
    bq_client = bigquery.Client()
    query_job = bq_client.query(query)
    # to_arrow() waits for the query to finish and returns a pyarrow.Table
    arrow_table = query_job.to_arrow()
    return arrow_table_to_pybytes(arrow_table)
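To sanity-check the serialization, the bytes can be read back with pyarrow's IPC stream reader. The snippet below is a minimal sketch that assumes arrow_table is the Table returned by to_arrow() above.

# Round-trip check: deserialize the IPC stream bytes back into a Table
data = arrow_table_to_pybytes(arrow_table)
reader = pa.ipc.open_stream(data)
assert reader.read_all().equals(arrow_table)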
Since Arrow is a binary format, be sure to set a proper content type when sending it over HTTP. On the client you can pass the response to Arrow.Table.from(), which is a zero-copy operation and thus extremely performant.
from django.http import HttpResponse
...
return HttpResponse(
    data_from_get_bq_arrow,
    content_type='application/octet-stream',
)
from flask import Response
...
return Response(
    data_from_get_bq_arrow,
    mimetype='application/octet-stream',
)