- From 9e72968ae5f2e27c4f1d2b90337aa2d27c06cae4 Mon Sep 17 00:00:00 2001
- From: Varad Gautam <varadgautam@gmail.com>
- Date: Wed, 17 Feb 2016 19:08:21 +0530
- Subject: [PATCH 293/381] drm/vc4: improve throughput by pipelining binning and
- rendering jobs
- The hardware provides us with separate threads for binning and
- rendering, and the existing model waits for them both to complete
- before submitting the next job.
- Splitting the binning and rendering submissions reduces idle time and
- gives us approx 20-30% speedup with some x11perf tests such as -line10
- and -tilerect1. Improves openarena performance by 1.01897% +/-
- 0.247857% (n=16).
- Thanks to anholt for suggesting this.
- v2: Rebase on the spurious resets fix (change by anholt).
- Signed-off-by: Varad Gautam <varadgautam@gmail.com>
- Reviewed-by: Eric Anholt <eric@anholt.net>
- Signed-off-by: Eric Anholt <eric@anholt.net>
- (cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
- ---
- drivers/gpu/drm/vc4/vc4_drv.h | 37 +++++++++----
- drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
- drivers/gpu/drm/vc4/vc4_irq.c | 58 ++++++++++++++++----
- 3 files changed, 166 insertions(+), 52 deletions(-)
- --- a/drivers/gpu/drm/vc4/vc4_drv.h
- +++ b/drivers/gpu/drm/vc4/vc4_drv.h
- @@ -53,7 +53,7 @@ struct vc4_dev {
- /* Protects bo_cache and the BO stats. */
- struct mutex bo_lock;
-
- - /* Sequence number for the last job queued in job_list.
- + /* Sequence number for the last job queued in bin_job_list.
- * Starts at 0 (no jobs emitted).
- */
- uint64_t emit_seqno;
- @@ -63,11 +63,19 @@ struct vc4_dev {
- */
- uint64_t finished_seqno;
-
- - /* List of all struct vc4_exec_info for jobs to be executed.
- - * The first job in the list is the one currently programmed
- - * into ct0ca/ct1ca for execution.
- + /* List of all struct vc4_exec_info for jobs to be executed in
- + * the binner. The first job in the list is the one currently
- + * programmed into ct0ca for execution.
- + */
- + struct list_head bin_job_list;
- +
- + /* List of all struct vc4_exec_info for jobs that have
- + * completed binning and are ready for rendering. The first
- + * job in the list is the one currently programmed into ct1ca
- + * for execution.
- */
- - struct list_head job_list;
- + struct list_head render_job_list;
- +
- /* List of the finished vc4_exec_infos waiting to be freed by
- * job_done_work.
- */
- @@ -291,11 +299,20 @@ struct vc4_exec_info {
- };
-
- static inline struct vc4_exec_info *
- -vc4_first_job(struct vc4_dev *vc4)
- +vc4_first_bin_job(struct vc4_dev *vc4)
- +{
- + if (list_empty(&vc4->bin_job_list))
- + return NULL;
- + return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
- +}
- +
- +static inline struct vc4_exec_info *
- +vc4_first_render_job(struct vc4_dev *vc4)
- {
- - if (list_empty(&vc4->job_list))
- + if (list_empty(&vc4->render_job_list))
- return NULL;
- - return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
- + return list_first_entry(&vc4->render_job_list,
- + struct vc4_exec_info, head);
- }
-
- /**
- @@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
- struct drm_file *file_priv);
- int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
- struct drm_file *file_priv);
- -void vc4_submit_next_job(struct drm_device *dev);
- +void vc4_submit_next_bin_job(struct drm_device *dev);
- +void vc4_submit_next_render_job(struct drm_device *dev);
- +void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
- int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
- uint64_t timeout_ns, bool interruptible);
- void vc4_job_handle_completed(struct vc4_dev *vc4);
- --- a/drivers/gpu/drm/vc4/vc4_gem.c
- +++ b/drivers/gpu/drm/vc4/vc4_gem.c
- @@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
- struct vc4_dev *vc4 = to_vc4_dev(dev);
- struct drm_vc4_get_hang_state *state;
- struct vc4_hang_state *kernel_state;
- - struct vc4_exec_info *exec;
- + struct vc4_exec_info *exec[2];
- struct vc4_bo *bo;
- unsigned long irqflags;
- - unsigned int i, unref_list_count;
- + unsigned int i, j, unref_list_count, prev_idx;
-
- kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
- if (!kernel_state)
- @@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
- state = &kernel_state->user_state;
-
- spin_lock_irqsave(&vc4->job_lock, irqflags);
- - exec = vc4_first_job(vc4);
- - if (!exec) {
- + exec[0] = vc4_first_bin_job(vc4);
- + exec[1] = vc4_first_render_job(vc4);
- + if (!exec[0] && !exec[1]) {
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
- return;
- }
-
- - unref_list_count = 0;
- - list_for_each_entry(bo, &exec->unref_list, unref_head)
- - unref_list_count++;
- -
- - state->bo_count = exec->bo_count + unref_list_count;
- - kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
- - GFP_ATOMIC);
- + /* Get the bos from both binner and renderer into hang state. */
- + state->bo_count = 0;
- + for (i = 0; i < 2; i++) {
- + if (!exec[i])
- + continue;
- +
- + unref_list_count = 0;
- + list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
- + unref_list_count++;
- + state->bo_count += exec[i]->bo_count + unref_list_count;
- + }
- +
- + kernel_state->bo = kcalloc(state->bo_count,
- + sizeof(*kernel_state->bo), GFP_ATOMIC);
- +
- if (!kernel_state->bo) {
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
- return;
- }
-
- - for (i = 0; i < exec->bo_count; i++) {
- - drm_gem_object_reference(&exec->bo[i]->base);
- - kernel_state->bo[i] = &exec->bo[i]->base;
- - }
- + prev_idx = 0;
- + for (i = 0; i < 2; i++) {
- + if (!exec[i])
- + continue;
- +
- + for (j = 0; j < exec[i]->bo_count; j++) {
- + drm_gem_object_reference(&exec[i]->bo[j]->base);
- + kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
- + }
-
- - list_for_each_entry(bo, &exec->unref_list, unref_head) {
- - drm_gem_object_reference(&bo->base.base);
- - kernel_state->bo[i] = &bo->base.base;
- - i++;
- + list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
- + drm_gem_object_reference(&bo->base.base);
- + kernel_state->bo[j + prev_idx] = &bo->base.base;
- + j++;
- + }
- + prev_idx = j + 1;
- }
-
- - state->start_bin = exec->ct0ca;
- - state->start_render = exec->ct1ca;
- + if (exec[0])
- + state->start_bin = exec[0]->ct0ca;
- + if (exec[1])
- + state->start_render = exec[1]->ct1ca;
-
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
-
- @@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
- struct vc4_dev *vc4 = to_vc4_dev(dev);
- uint32_t ct0ca, ct1ca;
- unsigned long irqflags;
- - struct vc4_exec_info *exec;
- + struct vc4_exec_info *bin_exec, *render_exec;
-
- spin_lock_irqsave(&vc4->job_lock, irqflags);
- - exec = vc4_first_job(vc4);
- +
- + bin_exec = vc4_first_bin_job(vc4);
- + render_exec = vc4_first_render_job(vc4);
-
- /* If idle, we can stop watching for hangs. */
- - if (!exec) {
- + if (!bin_exec && !render_exec) {
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
- return;
- }
- @@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
- /* If we've made any progress in execution, rearm the timer
- * and wait.
- */
- - if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
- - exec->last_ct0ca = ct0ca;
- - exec->last_ct1ca = ct1ca;
- + if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
- + (render_exec && ct1ca != render_exec->last_ct1ca)) {
- + if (bin_exec)
- + bin_exec->last_ct0ca = ct0ca;
- + if (render_exec)
- + render_exec->last_ct1ca = ct1ca;
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
- vc4_queue_hangcheck(dev);
- return;
- @@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
- * The job_lock should be held during this.
- */
- void
- -vc4_submit_next_job(struct drm_device *dev)
- +vc4_submit_next_bin_job(struct drm_device *dev)
- {
- struct vc4_dev *vc4 = to_vc4_dev(dev);
- - struct vc4_exec_info *exec = vc4_first_job(vc4);
- + struct vc4_exec_info *exec;
-
- +again:
- + exec = vc4_first_bin_job(vc4);
- if (!exec)
- return;
-
- @@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
- V3D_WRITE(V3D_BPOA, 0);
- V3D_WRITE(V3D_BPOS, 0);
-
- - if (exec->ct0ca != exec->ct0ea)
- + /* Either put the job in the binner if it uses the binner, or
- + * immediately move it to the to-be-rendered queue.
- + */
- + if (exec->ct0ca != exec->ct0ea) {
- submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
- + } else {
- + vc4_move_job_to_render(dev, exec);
- + goto again;
- + }
- +}
- +
- +void
- +vc4_submit_next_render_job(struct drm_device *dev)
- +{
- + struct vc4_dev *vc4 = to_vc4_dev(dev);
- + struct vc4_exec_info *exec = vc4_first_render_job(vc4);
- +
- + if (!exec)
- + return;
- +
- submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
- }
-
- +void
- +vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
- +{
- + struct vc4_dev *vc4 = to_vc4_dev(dev);
- + bool was_empty = list_empty(&vc4->render_job_list);
- +
- + list_move_tail(&exec->head, &vc4->render_job_list);
- + if (was_empty)
- + vc4_submit_next_render_job(dev);
- +}
- +
- static void
- vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
- {
- @@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
- exec->seqno = seqno;
- vc4_update_bo_seqnos(exec, seqno);
-
- - list_add_tail(&exec->head, &vc4->job_list);
- + list_add_tail(&exec->head, &vc4->bin_job_list);
-
- /* If no job was executing, kick ours off. Otherwise, it'll
- - * get started when the previous job's frame done interrupt
- + * get started when the previous job's flush done interrupt
- * occurs.
- */
- - if (vc4_first_job(vc4) == exec) {
- - vc4_submit_next_job(dev);
- + if (vc4_first_bin_job(vc4) == exec) {
- + vc4_submit_next_bin_job(dev);
- vc4_queue_hangcheck(dev);
- }
-
- @@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
- {
- struct vc4_dev *vc4 = to_vc4_dev(dev);
-
- - INIT_LIST_HEAD(&vc4->job_list);
- + INIT_LIST_HEAD(&vc4->bin_job_list);
- + INIT_LIST_HEAD(&vc4->render_job_list);
- INIT_LIST_HEAD(&vc4->job_done_list);
- INIT_LIST_HEAD(&vc4->seqno_cb_list);
- spin_lock_init(&vc4->job_lock);
- --- a/drivers/gpu/drm/vc4/vc4_irq.c
- +++ b/drivers/gpu/drm/vc4/vc4_irq.c
- @@ -30,6 +30,10 @@
- * disables that specific interrupt, and 0s written are ignored
- * (reading either one returns the set of enabled interrupts).
- *
- + * When we take a binning flush done interrupt, we need to submit the
- + * next frame for binning and move the finished frame to the render
- + * thread.
- + *
- * When we take a render frame interrupt, we need to wake the
- * processes waiting for some frame to be done, and get the next frame
- * submitted ASAP (so the hardware doesn't sit idle when there's work
- @@ -44,6 +48,7 @@
- #include "vc4_regs.h"
-
- #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
- + V3D_INT_FLDONE | \
- V3D_INT_FRDONE)
-
- DECLARE_WAIT_QUEUE_HEAD(render_wait);
- @@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
- unsigned long irqflags;
-
- spin_lock_irqsave(&vc4->job_lock, irqflags);
- - current_exec = vc4_first_job(vc4);
- + current_exec = vc4_first_bin_job(vc4);
- if (current_exec) {
- vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
- list_add_tail(&vc4->overflow_mem->unref_head,
- @@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
- }
-
- static void
- -vc4_irq_finish_job(struct drm_device *dev)
- +vc4_irq_finish_bin_job(struct drm_device *dev)
- +{
- + struct vc4_dev *vc4 = to_vc4_dev(dev);
- + struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
- +
- + if (!exec)
- + return;
- +
- + vc4_move_job_to_render(dev, exec);
- + vc4_submit_next_bin_job(dev);
- +}
- +
- +static void
- +vc4_cancel_bin_job(struct drm_device *dev)
- +{
- + struct vc4_dev *vc4 = to_vc4_dev(dev);
- + struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
- +
- + if (!exec)
- + return;
- +
- + list_move_tail(&exec->head, &vc4->bin_job_list);
- + vc4_submit_next_bin_job(dev);
- +}
- +
- +static void
- +vc4_irq_finish_render_job(struct drm_device *dev)
- {
- struct vc4_dev *vc4 = to_vc4_dev(dev);
- - struct vc4_exec_info *exec = vc4_first_job(vc4);
- + struct vc4_exec_info *exec = vc4_first_render_job(vc4);
-
- if (!exec)
- return;
-
- vc4->finished_seqno++;
- list_move_tail(&exec->head, &vc4->job_done_list);
- - vc4_submit_next_job(dev);
- + vc4_submit_next_render_job(dev);
-
- wake_up_all(&vc4->job_wait_queue);
- schedule_work(&vc4->job_done_work);
- @@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
- barrier();
- intctl = V3D_READ(V3D_INTCTL);
-
- - /* Acknowledge the interrupts we're handling here. The render
- - * frame done interrupt will be cleared, while OUTOMEM will
- - * stay high until the underlying cause is cleared.
- + /* Acknowledge the interrupts we're handling here. The binner
- + * last flush / render frame done interrupt will be cleared,
- + * while OUTOMEM will stay high until the underlying cause is
- + * cleared.
- */
- V3D_WRITE(V3D_INTCTL, intctl);
-
- @@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
- status = IRQ_HANDLED;
- }
-
- + if (intctl & V3D_INT_FLDONE) {
- + spin_lock(&vc4->job_lock);
- + vc4_irq_finish_bin_job(dev);
- + spin_unlock(&vc4->job_lock);
- + status = IRQ_HANDLED;
- + }
- +
- if (intctl & V3D_INT_FRDONE) {
- spin_lock(&vc4->job_lock);
- - vc4_irq_finish_job(dev);
- + vc4_irq_finish_render_job(dev);
- spin_unlock(&vc4->job_lock);
- status = IRQ_HANDLED;
- }
- @@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
- V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
-
- spin_lock_irqsave(&vc4->job_lock, irqflags);
- - vc4_irq_finish_job(dev);
- + vc4_cancel_bin_job(dev);
- + vc4_irq_finish_render_job(dev);
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
- }