User: | Jiri Slaby |
Error type: | Leaving function in locked state |
Error type description: | Some lock is not unlocked on all paths of a function, so it is leaked |
File location: | kernel/futex.c |
Line in file: | 978 |
Project: | Linux Kernel |
Project version: | 2.6.28 |
Tools: | Stanse (1.2), Undetermined 1 |
Entered: | 2012-03-04 17:07:06 UTC |
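
For orientation, line 978 in the listing that follows is the final "return ret;" of futex_requeue(); on that path the hash-bucket spinlocks are dropped at the out_unlock: label and mmap_sem is released via futex_unlock_mm() at out:. The short program below is not the reported code: it is a minimal, self-contained userspace sketch of the "leaving function in locked state" defect class itself, using a pthread mutex, and every name in it (table_lock, resize_table_buggy, resize_table_fixed) is invented for the illustration.

/*
 * Illustration only (hypothetical names): the "leaving function in locked
 * state" defect class.  resize_table_buggy() returns on its error path
 * with table_lock still held; resize_table_fixed() funnels every path
 * through a single unlock point.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int table_size;

static int resize_table_buggy(int n)
{
	pthread_mutex_lock(&table_lock);
	if (n <= 0)
		return -1;		/* lock leaked on this path */
	table_size = n;
	pthread_mutex_unlock(&table_lock);
	return 0;
}

static int resize_table_fixed(int n)
{
	int ret = 0;

	pthread_mutex_lock(&table_lock);
	if (n <= 0) {
		ret = -1;
		goto out_unlock;	/* same unwind style as futex.c */
	}
	table_size = n;
out_unlock:
	pthread_mutex_unlock(&table_lock);
	return ret;
}

int main(void)
{
	printf("fixed: %d\n", resize_table_fixed(16));
	printf("buggy: %d (table_lock is now left locked)\n",
	       resize_table_buggy(-1));
	return 0;
}

The goto-based single-unlock-point style in the fixed variant is the same unwind convention futex.c itself uses (its out_unlock:/out: labels), and it is the kind of lock/unlock pairing a checker such as Stanse tracks across all return paths.
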
1/* 2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * (C) Rusty Russell, IBM 2002 4 * 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 7 * 8 * Removed page pinning, fix privately mapped COW pages and other cleanups 9 * (C) Copyright 2003, 2004 Jamie Lokier 10 * 11 * Robust futex support started by Ingo Molnar 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 14 * 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 18 * 19 * PRIVATE futexes by Eric Dumazet 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 21 * 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 23 * enough at me, Linus for the original (flawed) idea, Matthew 24 * Kirkwood for proof-of-concept implementation. 25 * 26 * "The futexes are also cursed." 27 * "But they come in a choice of three flavours!" 28 * 29 * This program is free software; you can redistribute it and/or modify 30 * it under the terms of the GNU General Public License as published by 31 * the Free Software Foundation; either version 2 of the License, or 32 * (at your option) any later version. 33 * 34 * This program is distributed in the hope that it will be useful, 35 * but WITHOUT ANY WARRANTY; without even the implied warranty of 36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 37 * GNU General Public License for more details. 38 * 39 * You should have received a copy of the GNU General Public License 40 * along with this program; if not, write to the Free Software 41 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 42 */ 43#include <linux/slab.h> 44#include <linux/poll.h> 45#include <linux/fs.h> 46#include <linux/file.h> 47#include <linux/jhash.h> 48#include <linux/init.h> 49#include <linux/futex.h> 50#include <linux/mount.h> 51#include <linux/pagemap.h> 52#include <linux/syscalls.h> 53#include <linux/signal.h> 54#include <linux/module.h> 55#include <linux/magic.h> 56#include <linux/pid.h> 57#include <linux/nsproxy.h> 58 59#include <asm/futex.h> 60 61#include "rtmutex_common.h" 62 63int __read_mostly futex_cmpxchg_enabled; 64 65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 66 67/* 68 * Priority Inheritance state: 69 */ 70struct futex_pi_state { 71 /* 72 * list of 'owned' pi_state instances - these have to be 73 * cleaned up in do_exit() if the task exits prematurely: 74 */ 75 struct list_head list; 76 77 /* 78 * The PI object: 79 */ 80 struct rt_mutex pi_mutex; 81 82 struct task_struct *owner; 83 atomic_t refcount; 84 85 union futex_key key; 86}; 87 88/* 89 * We use this hashed waitqueue instead of a normal wait_queue_t, so 90 * we can wake only the relevant ones (hashed queues may be shared). 91 * 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 94 * The order of wakup is always to make the first condition true, then 95 * wake up q->waiters, then make the second condition true. 
96 */ 97struct futex_q { 98 struct plist_node list; 99 wait_queue_head_t waiters; 100 101 /* Which hash list lock to use: */ 102 spinlock_t *lock_ptr; 103 104 /* Key which the futex is hashed on: */ 105 union futex_key key; 106 107 /* Optional priority inheritance state: */ 108 struct futex_pi_state *pi_state; 109 struct task_struct *task; 110 111 /* Bitset for the optional bitmasked wakeup */ 112 u32 bitset; 113}; 114 115/* 116 * Split the global futex_lock into every hash list lock. 117 */ 118struct futex_hash_bucket { 119 spinlock_t lock; 120 struct plist_head chain; 121}; 122 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124 125/* 126 * Take mm->mmap_sem, when futex is shared 127 */ 128static inline void futex_lock_mm(struct rw_semaphore *fshared) 129{ 130 if (fshared) 131 down_read(fshared); 132} 133 134/* 135 * Release mm->mmap_sem, when the futex is shared 136 */ 137static inline void futex_unlock_mm(struct rw_semaphore *fshared) 138{ 139 if (fshared) 140 up_read(fshared); 141} 142 143/* 144 * We hash on the keys returned from get_futex_key (see below). 145 */ 146static struct futex_hash_bucket *hash_futex(union futex_key *key) 147{ 148 u32 hash = jhash2((u32*)&key->both.word, 149 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 150 key->both.offset); 151 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 152} 153 154/* 155 * Return 1 if two futex_keys are equal, 0 otherwise. 156 */ 157static inline int match_futex(union futex_key *key1, union futex_key *key2) 158{ 159 return (key1->both.word == key2->both.word 160 && key1->both.ptr == key2->both.ptr 161 && key1->both.offset == key2->both.offset); 162} 163 164/** 165 * get_futex_key - Get parameters which are the keys for a futex. 166 * @uaddr: virtual address of the futex 167 * @shared: NULL for a PROCESS_PRIVATE futex, 168 * ¤t->mm->mmap_sem for a PROCESS_SHARED futex 169 * @key: address where result is stored. 170 * 171 * Returns a negative error code or 0 172 * The key words are stored in *key on success. 173 * 174 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 175 * offset_within_page). For private mappings, it's (uaddr, current->mm). 176 * We can usually work out the index without swapping in the page. 177 * 178 * fshared is NULL for PROCESS_PRIVATE futexes 179 * For other futexes, it points to ¤t->mm->mmap_sem and 180 * caller must have taken the reader lock. but NOT any spinlocks. 181 */ 182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 183 union futex_key *key) 184{ 185 unsigned long address = (unsigned long)uaddr; 186 struct mm_struct *mm = current->mm; 187 struct vm_area_struct *vma; 188 struct page *page; 189 int err; 190 191 /* 192 * The futex address must be "naturally" aligned. 193 */ 194 key->both.offset = address % PAGE_SIZE; 195 if (unlikely((address % sizeof(u32)) != 0)) 196 return -EINVAL; 197 address -= key->both.offset; 198 199 /* 200 * PROCESS_PRIVATE futexes are fast. 201 * As the mm cannot disappear under us and the 'key' only needs 202 * virtual address, we dont even have to find the underlying vma. 203 * Note : We do have to check 'uaddr' is a valid user address, 204 * but access_ok() should be faster than find_vma() 205 */ 206 if (!fshared) { 207 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) 208 return -EFAULT; 209 key->private.mm = mm; 210 key->private.address = address; 211 return 0; 212 } 213 /* 214 * The futex is hashed differently depending on whether 215 * it's in a shared or private mapping. 
So check vma first. 216 */ 217 vma = find_extend_vma(mm, address); 218 if (unlikely(!vma)) 219 return -EFAULT; 220 221 /* 222 * Permissions. 223 */ 224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 226 227 /* 228 * Private mappings are handled in a simple way. 229 * 230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 231 * it's a read-only handle, it's expected that futexes attach to 232 * the object not the particular process. Therefore we use 233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared 234 * mappings of _writable_ handles. 235 */ 236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 238 key->private.mm = mm; 239 key->private.address = address; 240 return 0; 241 } 242 243 /* 244 * Linear file mappings are also simple. 245 */ 246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ 248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 250 + vma->vm_pgoff); 251 return 0; 252 } 253 254 /* 255 * We could walk the page table to read the non-linear 256 * pte, and get the page index without fetching the page 257 * from swap. But that's a lot of code to duplicate here 258 * for a rare case, so we simply fetch the page. 259 */ 260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); 261 if (err >= 0) { 262 key->shared.pgoff = 263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 264 put_page(page); 265 return 0; 266 } 267 return err; 268} 269 270/* 271 * Take a reference to the resource addressed by a key. 272 * Can be called while holding spinlocks. 273 * 274 */ 275static void get_futex_key_refs(union futex_key *key) 276{ 277 if (key->both.ptr == NULL) 278 return; 279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 280 case FUT_OFF_INODE: 281 atomic_inc(&key->shared.inode->i_count); 282 break; 283 case FUT_OFF_MMSHARED: 284 atomic_inc(&key->private.mm->mm_count); 285 break; 286 } 287} 288 289/* 290 * Drop a reference to the resource addressed by a key. 291 * The hash bucket spinlock must not be held. 292 */ 293static void drop_futex_key_refs(union futex_key *key) 294{ 295 if (!key->both.ptr) 296 return; 297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 298 case FUT_OFF_INODE: 299 iput(key->shared.inode); 300 break; 301 case FUT_OFF_MMSHARED: 302 mmdrop(key->private.mm); 303 break; 304 } 305} 306 307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 308{ 309 u32 curval; 310 311 pagefault_disable(); 312 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 313 pagefault_enable(); 314 315 return curval; 316} 317 318static int get_futex_value_locked(u32 *dest, u32 __user *from) 319{ 320 int ret; 321 322 pagefault_disable(); 323 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 324 pagefault_enable(); 325 326 return ret ? -EFAULT : 0; 327} 328 329/* 330 * Fault handling. 
331 * if fshared is non NULL, current->mm->mmap_sem is already held 332 */ 333static int futex_handle_fault(unsigned long address, 334 struct rw_semaphore *fshared, int attempt) 335{ 336 struct vm_area_struct * vma; 337 struct mm_struct *mm = current->mm; 338 int ret = -EFAULT; 339 340 if (attempt > 2) 341 return ret; 342 343 if (!fshared) 344 down_read(&mm->mmap_sem); 345 vma = find_vma(mm, address); 346 if (vma && address >= vma->vm_start && 347 (vma->vm_flags & VM_WRITE)) { 348 int fault; 349 fault = handle_mm_fault(mm, vma, address, 1); 350 if (unlikely((fault & VM_FAULT_ERROR))) { 351#if 0 352 /* XXX: let's do this when we verify it is OK */ 353 if (ret & VM_FAULT_OOM) 354 ret = -ENOMEM; 355#endif 356 } else { 357 ret = 0; 358 if (fault & VM_FAULT_MAJOR) 359 current->maj_flt++; 360 else 361 current->min_flt++; 362 } 363 } 364 if (!fshared) 365 up_read(&mm->mmap_sem); 366 return ret; 367} 368 369/* 370 * PI code: 371 */ 372static int refill_pi_state_cache(void) 373{ 374 struct futex_pi_state *pi_state; 375 376 if (likely(current->pi_state_cache)) 377 return 0; 378 379 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 380 381 if (!pi_state) 382 return -ENOMEM; 383 384 INIT_LIST_HEAD(&pi_state->list); 385 /* pi_mutex gets initialized later */ 386 pi_state->owner = NULL; 387 atomic_set(&pi_state->refcount, 1); 388 389 current->pi_state_cache = pi_state; 390 391 return 0; 392} 393 394static struct futex_pi_state * alloc_pi_state(void) 395{ 396 struct futex_pi_state *pi_state = current->pi_state_cache; 397 398 WARN_ON(!pi_state); 399 current->pi_state_cache = NULL; 400 401 return pi_state; 402} 403 404static void free_pi_state(struct futex_pi_state *pi_state) 405{ 406 if (!atomic_dec_and_test(&pi_state->refcount)) 407 return; 408 409 /* 410 * If pi_state->owner is NULL, the owner is most probably dying 411 * and has cleaned up the pi_state already 412 */ 413 if (pi_state->owner) { 414 spin_lock_irq(&pi_state->owner->pi_lock); 415 list_del_init(&pi_state->list); 416 spin_unlock_irq(&pi_state->owner->pi_lock); 417 418 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 419 } 420 421 if (current->pi_state_cache) 422 kfree(pi_state); 423 else { 424 /* 425 * pi_state->list is already empty. 426 * clear pi_state->owner. 427 * refcount is at 0 - put it back to 1. 428 */ 429 pi_state->owner = NULL; 430 atomic_set(&pi_state->refcount, 1); 431 current->pi_state_cache = pi_state; 432 } 433} 434 435/* 436 * Look up the task based on what TID userspace gave us. 437 * We dont trust it. 438 */ 439static struct task_struct * futex_find_get_task(pid_t pid) 440{ 441 struct task_struct *p; 442 443 rcu_read_lock(); 444 p = find_task_by_vpid(pid); 445 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 446 p = ERR_PTR(-ESRCH); 447 else 448 get_task_struct(p); 449 450 rcu_read_unlock(); 451 452 return p; 453} 454 455/* 456 * This task is holding PI mutexes at exit time => bad. 457 * Kernel cleans up PI-state, but userspace is likely hosed. 458 * (Robust-futex cleanup is separate and might save the day for userspace.) 
459 */ 460void exit_pi_state_list(struct task_struct *curr) 461{ 462 struct list_head *next, *head = &curr->pi_state_list; 463 struct futex_pi_state *pi_state; 464 struct futex_hash_bucket *hb; 465 union futex_key key; 466 467 if (!futex_cmpxchg_enabled) 468 return; 469 /* 470 * We are a ZOMBIE and nobody can enqueue itself on 471 * pi_state_list anymore, but we have to be careful 472 * versus waiters unqueueing themselves: 473 */ 474 spin_lock_irq(&curr->pi_lock); 475 while (!list_empty(head)) { 476 477 next = head->next; 478 pi_state = list_entry(next, struct futex_pi_state, list); 479 key = pi_state->key; 480 hb = hash_futex(&key); 481 spin_unlock_irq(&curr->pi_lock); 482 483 spin_lock(&hb->lock); 484 485 spin_lock_irq(&curr->pi_lock); 486 /* 487 * We dropped the pi-lock, so re-check whether this 488 * task still owns the PI-state: 489 */ 490 if (head->next != next) { 491 spin_unlock(&hb->lock); 492 continue; 493 } 494 495 WARN_ON(pi_state->owner != curr); 496 WARN_ON(list_empty(&pi_state->list)); 497 list_del_init(&pi_state->list); 498 pi_state->owner = NULL; 499 spin_unlock_irq(&curr->pi_lock); 500 501 rt_mutex_unlock(&pi_state->pi_mutex); 502 503 spin_unlock(&hb->lock); 504 505 spin_lock_irq(&curr->pi_lock); 506 } 507 spin_unlock_irq(&curr->pi_lock); 508} 509 510static int 511lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 512 union futex_key *key, struct futex_pi_state **ps) 513{ 514 struct futex_pi_state *pi_state = NULL; 515 struct futex_q *this, *next; 516 struct plist_head *head; 517 struct task_struct *p; 518 pid_t pid = uval & FUTEX_TID_MASK; 519 520 head = &hb->chain; 521 522 plist_for_each_entry_safe(this, next, head, list) { 523 if (match_futex(&this->key, key)) { 524 /* 525 * Another waiter already exists - bump up 526 * the refcount and return its pi_state: 527 */ 528 pi_state = this->pi_state; 529 /* 530 * Userspace might have messed up non PI and PI futexes 531 */ 532 if (unlikely(!pi_state)) 533 return -EINVAL; 534 535 WARN_ON(!atomic_read(&pi_state->refcount)); 536 WARN_ON(pid && pi_state->owner && 537 pi_state->owner->pid != pid); 538 539 atomic_inc(&pi_state->refcount); 540 *ps = pi_state; 541 542 return 0; 543 } 544 } 545 546 /* 547 * We are the first waiter - try to look up the real owner and attach 548 * the new pi_state to it, but bail out when TID = 0 549 */ 550 if (!pid) 551 return -ESRCH; 552 p = futex_find_get_task(pid); 553 if (IS_ERR(p)) 554 return PTR_ERR(p); 555 556 /* 557 * We need to look at the task state flags to figure out, 558 * whether the task is exiting. To protect against the do_exit 559 * change of the task flags, we do this protected by 560 * p->pi_lock: 561 */ 562 spin_lock_irq(&p->pi_lock); 563 if (unlikely(p->flags & PF_EXITING)) { 564 /* 565 * The task is on the way out. When PF_EXITPIDONE is 566 * set, we know that the task has finished the 567 * cleanup: 568 */ 569 int ret = (p->flags & PF_EXITPIDONE) ? 
-ESRCH : -EAGAIN; 570 571 spin_unlock_irq(&p->pi_lock); 572 put_task_struct(p); 573 return ret; 574 } 575 576 pi_state = alloc_pi_state(); 577 578 /* 579 * Initialize the pi_mutex in locked state and make 'p' 580 * the owner of it: 581 */ 582 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 583 584 /* Store the key for possible exit cleanups: */ 585 pi_state->key = *key; 586 587 WARN_ON(!list_empty(&pi_state->list)); 588 list_add(&pi_state->list, &p->pi_state_list); 589 pi_state->owner = p; 590 spin_unlock_irq(&p->pi_lock); 591 592 put_task_struct(p); 593 594 *ps = pi_state; 595 596 return 0; 597} 598 599/* 600 * The hash bucket lock must be held when this is called. 601 * Afterwards, the futex_q must not be accessed. 602 */ 603static void wake_futex(struct futex_q *q) 604{ 605 plist_del(&q->list, &q->list.plist); 606 /* 607 * The lock in wake_up_all() is a crucial memory barrier after the 608 * plist_del() and also before assigning to q->lock_ptr. 609 */ 610 wake_up_all(&q->waiters); 611 /* 612 * The waiting task can free the futex_q as soon as this is written, 613 * without taking any locks. This must come last. 614 * 615 * A memory barrier is required here to prevent the following store 616 * to lock_ptr from getting ahead of the wakeup. Clearing the lock 617 * at the end of wake_up_all() does not prevent this store from 618 * moving. 619 */ 620 smp_wmb(); 621 q->lock_ptr = NULL; 622} 623 624static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 625{ 626 struct task_struct *new_owner; 627 struct futex_pi_state *pi_state = this->pi_state; 628 u32 curval, newval; 629 630 if (!pi_state) 631 return -EINVAL; 632 633 spin_lock(&pi_state->pi_mutex.wait_lock); 634 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 635 636 /* 637 * This happens when we have stolen the lock and the original 638 * pending owner did not enqueue itself back on the rt_mutex. 639 * Thats not a tragedy. We know that way, that a lock waiter 640 * is on the fly. We make the futex_q waiter the pending owner. 641 */ 642 if (!new_owner) 643 new_owner = this->task; 644 645 /* 646 * We pass it to the next owner. (The WAITERS bit is always 647 * kept enabled while there is PI state around. We must also 648 * preserve the owner died bit.) 649 */ 650 if (!(uval & FUTEX_OWNER_DIED)) { 651 int ret = 0; 652 653 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 654 655 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 656 657 if (curval == -EFAULT) 658 ret = -EFAULT; 659 else if (curval != uval) 660 ret = -EINVAL; 661 if (ret) { 662 spin_unlock(&pi_state->pi_mutex.wait_lock); 663 return ret; 664 } 665 } 666 667 spin_lock_irq(&pi_state->owner->pi_lock); 668 WARN_ON(list_empty(&pi_state->list)); 669 list_del_init(&pi_state->list); 670 spin_unlock_irq(&pi_state->owner->pi_lock); 671 672 spin_lock_irq(&new_owner->pi_lock); 673 WARN_ON(!list_empty(&pi_state->list)); 674 list_add(&pi_state->list, &new_owner->pi_state_list); 675 pi_state->owner = new_owner; 676 spin_unlock_irq(&new_owner->pi_lock); 677 678 spin_unlock(&pi_state->pi_mutex.wait_lock); 679 rt_mutex_unlock(&pi_state->pi_mutex); 680 681 return 0; 682} 683 684static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 685{ 686 u32 oldval; 687 688 /* 689 * There is no waiter, so we unlock the futex. The owner died 690 * bit has not to be preserved here. 
We are the owner: 691 */ 692 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 693 694 if (oldval == -EFAULT) 695 return oldval; 696 if (oldval != uval) 697 return -EAGAIN; 698 699 return 0; 700} 701 702/* 703 * Express the locking dependencies for lockdep: 704 */ 705static inline void 706double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 707{ 708 if (hb1 <= hb2) { 709 spin_lock(&hb1->lock); 710 if (hb1 < hb2) 711 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 712 } else { /* hb1 > hb2 */ 713 spin_lock(&hb2->lock); 714 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 715 } 716} 717 718/* 719 * Wake up all waiters hashed on the physical page that is mapped 720 * to this virtual address: 721 */ 722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 723 int nr_wake, u32 bitset) 724{ 725 struct futex_hash_bucket *hb; 726 struct futex_q *this, *next; 727 struct plist_head *head; 728 union futex_key key; 729 int ret; 730 731 if (!bitset) 732 return -EINVAL; 733 734 futex_lock_mm(fshared); 735 736 ret = get_futex_key(uaddr, fshared, &key); 737 if (unlikely(ret != 0)) 738 goto out; 739 740 hb = hash_futex(&key); 741 spin_lock(&hb->lock); 742 head = &hb->chain; 743 744 plist_for_each_entry_safe(this, next, head, list) { 745 if (match_futex (&this->key, &key)) { 746 if (this->pi_state) { 747 ret = -EINVAL; 748 break; 749 } 750 751 /* Check if one of the bits is set in both bitsets */ 752 if (!(this->bitset & bitset)) 753 continue; 754 755 wake_futex(this); 756 if (++ret >= nr_wake) 757 break; 758 } 759 } 760 761 spin_unlock(&hb->lock); 762out: 763 futex_unlock_mm(fshared); 764 return ret; 765} 766 767/* 768 * Wake up all waiters hashed on the physical page that is mapped 769 * to this virtual address: 770 */ 771static int 772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 773 u32 __user *uaddr2, 774 int nr_wake, int nr_wake2, int op) 775{ 776 union futex_key key1, key2; 777 struct futex_hash_bucket *hb1, *hb2; 778 struct plist_head *head; 779 struct futex_q *this, *next; 780 int ret, op_ret, attempt = 0; 781 782retryfull: 783 futex_lock_mm(fshared); 784 785 ret = get_futex_key(uaddr1, fshared, &key1); 786 if (unlikely(ret != 0)) 787 goto out; 788 ret = get_futex_key(uaddr2, fshared, &key2); 789 if (unlikely(ret != 0)) 790 goto out; 791 792 hb1 = hash_futex(&key1); 793 hb2 = hash_futex(&key2); 794 795retry: 796 double_lock_hb(hb1, hb2); 797 798 op_ret = futex_atomic_op_inuser(op, uaddr2); 799 if (unlikely(op_ret < 0)) { 800 u32 dummy; 801 802 spin_unlock(&hb1->lock); 803 if (hb1 != hb2) 804 spin_unlock(&hb2->lock); 805 806#ifndef CONFIG_MMU 807 /* 808 * we don't get EFAULT from MMU faults if we don't have an MMU, 809 * but we might get them from range checking 810 */ 811 ret = op_ret; 812 goto out; 813#endif 814 815 if (unlikely(op_ret != -EFAULT)) { 816 ret = op_ret; 817 goto out; 818 } 819 820 /* 821 * futex_atomic_op_inuser needs to both read and write 822 * *(int __user *)uaddr2, but we can't modify it 823 * non-atomically. Therefore, if get_user below is not 824 * enough, we need to handle the fault ourselves, while 825 * still holding the mmap_sem. 826 */ 827 if (attempt++) { 828 ret = futex_handle_fault((unsigned long)uaddr2, 829 fshared, attempt); 830 if (ret) 831 goto out; 832 goto retry; 833 } 834 835 /* 836 * If we would have faulted, release mmap_sem, 837 * fault it in and start all over again. 
838	 */
839		futex_unlock_mm(fshared);
840
841		ret = get_user(dummy, uaddr2);
842		if (ret)
843			return ret;
844
845		goto retryfull;
846	}
847
848	head = &hb1->chain;
849
850	plist_for_each_entry_safe(this, next, head, list) {
851		if (match_futex (&this->key, &key1)) {
852			wake_futex(this);
853			if (++ret >= nr_wake)
854				break;
855		}
856	}
857
858	if (op_ret > 0) {
859		head = &hb2->chain;
860
861		op_ret = 0;
862		plist_for_each_entry_safe(this, next, head, list) {
863			if (match_futex (&this->key, &key2)) {
864				wake_futex(this);
865				if (++op_ret >= nr_wake2)
866					break;
867			}
868		}
869		ret += op_ret;
870	}
871
872	spin_unlock(&hb1->lock);
873	if (hb1 != hb2)
874		spin_unlock(&hb2->lock);
875out:
876	futex_unlock_mm(fshared);
877
878	return ret;
879}
880
881/*
882 * Requeue all waiters hashed on one physical page to another
883 * physical page.
884 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
886			 u32 __user *uaddr2,
887			 int nr_wake, int nr_requeue, u32 *cmpval)
888{
889	union futex_key key1, key2;
890	struct futex_hash_bucket *hb1, *hb2;
891	struct plist_head *head1;
892	struct futex_q *this, *next;
893	int ret, drop_count = 0;
894
895 retry:
896	futex_lock_mm(fshared);
897
898	ret = get_futex_key(uaddr1, fshared, &key1);
899	if (unlikely(ret != 0))
900		goto out;
901	ret = get_futex_key(uaddr2, fshared, &key2);
902	if (unlikely(ret != 0))
903		goto out;
904
905	hb1 = hash_futex(&key1);
906	hb2 = hash_futex(&key2);
907
908	double_lock_hb(hb1, hb2);
909
910	if (likely(cmpval != NULL)) {
911		u32 curval;
912
913		ret = get_futex_value_locked(&curval, uaddr1);
914
915		if (unlikely(ret)) {
916			spin_unlock(&hb1->lock);
917			if (hb1 != hb2)
918				spin_unlock(&hb2->lock);
919
920			/*
921			 * If we would have faulted, release mmap_sem, fault
922			 * it in and start all over again.
923			 */
924			futex_unlock_mm(fshared);
925
926			ret = get_user(curval, uaddr1);
927
928			if (!ret)
929				goto retry;
930
931			return ret;
932		}
933		if (curval != *cmpval) {
934			ret = -EAGAIN;
935			goto out_unlock;
936		}
937	}
938
939	head1 = &hb1->chain;
940	plist_for_each_entry_safe(this, next, head1, list) {
941		if (!match_futex (&this->key, &key1))
942			continue;
943		if (++ret <= nr_wake) {
944			wake_futex(this);
945		} else {
946			/*
947			 * If key1 and key2 hash to the same bucket, no need to
948			 * requeue.
949			 */
950			if (likely(head1 != &hb2->chain)) {
951				plist_del(&this->list, &hb1->chain);
952				plist_add(&this->list, &hb2->chain);
953				this->lock_ptr = &hb2->lock;
954#ifdef CONFIG_DEBUG_PI_LIST
955				this->list.plist.lock = &hb2->lock;
956#endif
957			}
958			this->key = key2;
959			get_futex_key_refs(&key2);
960			drop_count++;
961
962			if (ret - nr_wake >= nr_requeue)
963				break;
964		}
965	}
966
967out_unlock:
968	spin_unlock(&hb1->lock);
969	if (hb1 != hb2)
970		spin_unlock(&hb2->lock);
971
972	/* drop_futex_key_refs() must be called outside the spinlocks. */
973	while (--drop_count >= 0)
974		drop_futex_key_refs(&key1);
975
976out:
977	futex_unlock_mm(fshared);
978	return ret;
979}
980
981/* The key must be already stored in q->key.
*/ 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 983{ 984 struct futex_hash_bucket *hb; 985 986 init_waitqueue_head(&q->waiters); 987 988 get_futex_key_refs(&q->key); 989 hb = hash_futex(&q->key); 990 q->lock_ptr = &hb->lock; 991 992 spin_lock(&hb->lock); 993 return hb; 994} 995 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 997{ 998 int prio; 999 1000 /* 1001 * The priority used to register this element is 1002 * - either the real thread-priority for the real-time threads 1003 * (i.e. threads with a priority lower than MAX_RT_PRIO) 1004 * - or MAX_RT_PRIO for non-RT threads. 1005 * Thus, all RT-threads are woken first in priority order, and 1006 * the others are woken last, in FIFO order. 1007 */ 1008 prio = min(current->normal_prio, MAX_RT_PRIO); 1009 1010 plist_node_init(&q->list, prio); 1011#ifdef CONFIG_DEBUG_PI_LIST 1012 q->list.plist.lock = &hb->lock; 1013#endif 1014 plist_add(&q->list, &hb->chain); 1015 q->task = current; 1016 spin_unlock(&hb->lock); 1017} 1018 1019static inline void 1020queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1021{ 1022 spin_unlock(&hb->lock); 1023 drop_futex_key_refs(&q->key); 1024} 1025 1026/* 1027 * queue_me and unqueue_me must be called as a pair, each 1028 * exactly once. They are called with the hashed spinlock held. 1029 */ 1030 1031/* Return 1 if we were still queued (ie. 0 means we were woken) */ 1032static int unqueue_me(struct futex_q *q) 1033{ 1034 spinlock_t *lock_ptr; 1035 int ret = 0; 1036 1037 /* In the common case we don't take the spinlock, which is nice. */ 1038 retry: 1039 lock_ptr = q->lock_ptr; 1040 barrier(); 1041 if (lock_ptr != NULL) { 1042 spin_lock(lock_ptr); 1043 /* 1044 * q->lock_ptr can change between reading it and 1045 * spin_lock(), causing us to take the wrong lock. This 1046 * corrects the race condition. 1047 * 1048 * Reasoning goes like this: if we have the wrong lock, 1049 * q->lock_ptr must have changed (maybe several times) 1050 * between reading it and the spin_lock(). It can 1051 * change again after the spin_lock() but only if it was 1052 * already changed before the spin_lock(). It cannot, 1053 * however, change back to the original value. Therefore 1054 * we can detect whether we acquired the correct lock. 1055 */ 1056 if (unlikely(lock_ptr != q->lock_ptr)) { 1057 spin_unlock(lock_ptr); 1058 goto retry; 1059 } 1060 WARN_ON(plist_node_empty(&q->list)); 1061 plist_del(&q->list, &q->list.plist); 1062 1063 BUG_ON(q->pi_state); 1064 1065 spin_unlock(lock_ptr); 1066 ret = 1; 1067 } 1068 1069 drop_futex_key_refs(&q->key); 1070 return ret; 1071} 1072 1073/* 1074 * PI futexes can not be requeued and must remove themself from the 1075 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 1076 * and dropped here. 1077 */ 1078static void unqueue_me_pi(struct futex_q *q) 1079{ 1080 WARN_ON(plist_node_empty(&q->list)); 1081 plist_del(&q->list, &q->list.plist); 1082 1083 BUG_ON(!q->pi_state); 1084 free_pi_state(q->pi_state); 1085 q->pi_state = NULL; 1086 1087 spin_unlock(q->lock_ptr); 1088 1089 drop_futex_key_refs(&q->key); 1090} 1091 1092/* 1093 * Fixup the pi_state owner with the new owner. 1094 * 1095 * Must be called with hash bucket lock held and mm->sem held for non 1096 * private futexes. 
1097 */ 1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1099 struct task_struct *newowner, 1100 struct rw_semaphore *fshared) 1101{ 1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1103 struct futex_pi_state *pi_state = q->pi_state; 1104 struct task_struct *oldowner = pi_state->owner; 1105 u32 uval, curval, newval; 1106 int ret, attempt = 0; 1107 1108 /* Owner died? */ 1109 if (!pi_state->owner) 1110 newtid |= FUTEX_OWNER_DIED; 1111 1112 /* 1113 * We are here either because we stole the rtmutex from the 1114 * pending owner or we are the pending owner which failed to 1115 * get the rtmutex. We have to replace the pending owner TID 1116 * in the user space variable. This must be atomic as we have 1117 * to preserve the owner died bit here. 1118 * 1119 * Note: We write the user space value _before_ changing the 1120 * pi_state because we can fault here. Imagine swapped out 1121 * pages or a fork, which was running right before we acquired 1122 * mmap_sem, that marked all the anonymous memory readonly for 1123 * cow. 1124 * 1125 * Modifying pi_state _before_ the user space value would 1126 * leave the pi_state in an inconsistent state when we fault 1127 * here, because we need to drop the hash bucket lock to 1128 * handle the fault. This might be observed in the PID check 1129 * in lookup_pi_state. 1130 */ 1131retry: 1132 if (get_futex_value_locked(&uval, uaddr)) 1133 goto handle_fault; 1134 1135 while (1) { 1136 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1137 1138 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1139 1140 if (curval == -EFAULT) 1141 goto handle_fault; 1142 if (curval == uval) 1143 break; 1144 uval = curval; 1145 } 1146 1147 /* 1148 * We fixed up user space. Now we need to fix the pi_state 1149 * itself. 1150 */ 1151 if (pi_state->owner != NULL) { 1152 spin_lock_irq(&pi_state->owner->pi_lock); 1153 WARN_ON(list_empty(&pi_state->list)); 1154 list_del_init(&pi_state->list); 1155 spin_unlock_irq(&pi_state->owner->pi_lock); 1156 } 1157 1158 pi_state->owner = newowner; 1159 1160 spin_lock_irq(&newowner->pi_lock); 1161 WARN_ON(!list_empty(&pi_state->list)); 1162 list_add(&pi_state->list, &newowner->pi_state_list); 1163 spin_unlock_irq(&newowner->pi_lock); 1164 return 0; 1165 1166 /* 1167 * To handle the page fault we need to drop the hash bucket 1168 * lock here. That gives the other task (either the pending 1169 * owner itself or the task which stole the rtmutex) the 1170 * chance to try the fixup of the pi_state. So once we are 1171 * back from handling the fault we need to check the pi_state 1172 * after reacquiring the hash bucket lock and before trying to 1173 * do another fixup. When the fixup has been done already we 1174 * simply return. 
1175 */ 1176handle_fault: 1177 spin_unlock(q->lock_ptr); 1178 1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1180 1181 spin_lock(q->lock_ptr); 1182 1183 /* 1184 * Check if someone else fixed it for us: 1185 */ 1186 if (pi_state->owner != oldowner) 1187 return 0; 1188 1189 if (ret) 1190 return ret; 1191 1192 goto retry; 1193} 1194 1195/* 1196 * In case we must use restart_block to restart a futex_wait, 1197 * we encode in the 'flags' shared capability 1198 */ 1199#define FLAGS_SHARED 1 1200 1201static long futex_wait_restart(struct restart_block *restart); 1202 1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1204 u32 val, ktime_t *abs_time, u32 bitset) 1205{ 1206 struct task_struct *curr = current; 1207 DECLARE_WAITQUEUE(wait, curr); 1208 struct futex_hash_bucket *hb; 1209 struct futex_q q; 1210 u32 uval; 1211 int ret; 1212 struct hrtimer_sleeper t; 1213 int rem = 0; 1214 1215 if (!bitset) 1216 return -EINVAL; 1217 1218 q.pi_state = NULL; 1219 q.bitset = bitset; 1220 retry: 1221 futex_lock_mm(fshared); 1222 1223 ret = get_futex_key(uaddr, fshared, &q.key); 1224 if (unlikely(ret != 0)) 1225 goto out_release_sem; 1226 1227 hb = queue_lock(&q); 1228 1229 /* 1230 * Access the page AFTER the futex is queued. 1231 * Order is important: 1232 * 1233 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 1234 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 1235 * 1236 * The basic logical guarantee of a futex is that it blocks ONLY 1237 * if cond(var) is known to be true at the time of blocking, for 1238 * any cond. If we queued after testing *uaddr, that would open 1239 * a race condition where we could block indefinitely with 1240 * cond(var) false, which would violate the guarantee. 1241 * 1242 * A consequence is that futex_wait() can return zero and absorb 1243 * a wakeup when *uaddr != val on entry to the syscall. This is 1244 * rare, but normal. 1245 * 1246 * for shared futexes, we hold the mmap semaphore, so the mapping 1247 * cannot have changed since we looked it up in get_futex_key. 1248 */ 1249 ret = get_futex_value_locked(&uval, uaddr); 1250 1251 if (unlikely(ret)) { 1252 queue_unlock(&q, hb); 1253 1254 /* 1255 * If we would have faulted, release mmap_sem, fault it in and 1256 * start all over again. 1257 */ 1258 futex_unlock_mm(fshared); 1259 1260 ret = get_user(uval, uaddr); 1261 1262 if (!ret) 1263 goto retry; 1264 return ret; 1265 } 1266 ret = -EWOULDBLOCK; 1267 if (uval != val) 1268 goto out_unlock_release_sem; 1269 1270 /* Only actually queue if *uaddr contained val. */ 1271 queue_me(&q, hb); 1272 1273 /* 1274 * Now the futex is queued and we have checked the data, we 1275 * don't want to hold mmap_sem while we sleep. 1276 */ 1277 futex_unlock_mm(fshared); 1278 1279 /* 1280 * There might have been scheduling since the queue_me(), as we 1281 * cannot hold a spinlock across the get_user() in case it 1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1283 * queueing ourselves into the futex hash. This code thus has to 1284 * rely on the futex_wake() code removing us from hash when it 1285 * wakes us up. 1286 */ 1287 1288 /* add_wait_queue is the barrier after __set_current_state. */ 1289 __set_current_state(TASK_INTERRUPTIBLE); 1290 add_wait_queue(&q.waiters, &wait); 1291 /* 1292 * !plist_node_empty() is safe here without any lock. 1293 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 
1294 */ 1295 if (likely(!plist_node_empty(&q.list))) { 1296 if (!abs_time) 1297 schedule(); 1298 else { 1299 unsigned long slack; 1300 slack = current->timer_slack_ns; 1301 if (rt_task(current)) 1302 slack = 0; 1303 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1304 HRTIMER_MODE_ABS); 1305 hrtimer_init_sleeper(&t, current); 1306 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1307 1308 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); 1309 if (!hrtimer_active(&t.timer)) 1310 t.task = NULL; 1311 1312 /* 1313 * the timer could have already expired, in which 1314 * case current would be flagged for rescheduling. 1315 * Don't bother calling schedule. 1316 */ 1317 if (likely(t.task)) 1318 schedule(); 1319 1320 hrtimer_cancel(&t.timer); 1321 1322 /* Flag if a timeout occured */ 1323 rem = (t.task == NULL); 1324 1325 destroy_hrtimer_on_stack(&t.timer); 1326 } 1327 } 1328 __set_current_state(TASK_RUNNING); 1329 1330 /* 1331 * NOTE: we don't remove ourselves from the waitqueue because 1332 * we are the only user of it. 1333 */ 1334 1335 /* If we were woken (and unqueued), we succeeded, whatever. */ 1336 if (!unqueue_me(&q)) 1337 return 0; 1338 if (rem) 1339 return -ETIMEDOUT; 1340 1341 /* 1342 * We expect signal_pending(current), but another thread may 1343 * have handled it for us already. 1344 */ 1345 if (!abs_time) 1346 return -ERESTARTSYS; 1347 else { 1348 struct restart_block *restart; 1349 restart = ¤t_thread_info()->restart_block; 1350 restart->fn = futex_wait_restart; 1351 restart->futex.uaddr = (u32 *)uaddr; 1352 restart->futex.val = val; 1353 restart->futex.time = abs_time->tv64; 1354 restart->futex.bitset = bitset; 1355 restart->futex.flags = 0; 1356 1357 if (fshared) 1358 restart->futex.flags |= FLAGS_SHARED; 1359 return -ERESTART_RESTARTBLOCK; 1360 } 1361 1362 out_unlock_release_sem: 1363 queue_unlock(&q, hb); 1364 1365 out_release_sem: 1366 futex_unlock_mm(fshared); 1367 return ret; 1368} 1369 1370 1371static long futex_wait_restart(struct restart_block *restart) 1372{ 1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1374 struct rw_semaphore *fshared = NULL; 1375 ktime_t t; 1376 1377 t.tv64 = restart->futex.time; 1378 restart->fn = do_no_restart_syscall; 1379 if (restart->futex.flags & FLAGS_SHARED) 1380 fshared = ¤t->mm->mmap_sem; 1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1382 restart->futex.bitset); 1383} 1384 1385 1386/* 1387 * Userspace tried a 0 -> TID atomic transition of the futex value 1388 * and failed. The kernel side here does the whole locking operation: 1389 * if there are waiters then it will block, it does PI, etc. (Due to 1390 * races the kernel might see a 0 value of the futex too.) 
1391 */ 1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1393 int detect, ktime_t *time, int trylock) 1394{ 1395 struct hrtimer_sleeper timeout, *to = NULL; 1396 struct task_struct *curr = current; 1397 struct futex_hash_bucket *hb; 1398 u32 uval, newval, curval; 1399 struct futex_q q; 1400 int ret, lock_taken, ownerdied = 0, attempt = 0; 1401 1402 if (refill_pi_state_cache()) 1403 return -ENOMEM; 1404 1405 if (time) { 1406 to = &timeout; 1407 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 1408 HRTIMER_MODE_ABS); 1409 hrtimer_init_sleeper(to, current); 1410 hrtimer_set_expires(&to->timer, *time); 1411 } 1412 1413 q.pi_state = NULL; 1414 retry: 1415 futex_lock_mm(fshared); 1416 1417 ret = get_futex_key(uaddr, fshared, &q.key); 1418 if (unlikely(ret != 0)) 1419 goto out_release_sem; 1420 1421 retry_unlocked: 1422 hb = queue_lock(&q); 1423 1424 retry_locked: 1425 ret = lock_taken = 0; 1426 1427 /* 1428 * To avoid races, we attempt to take the lock here again 1429 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1430 * the locks. It will most likely not succeed. 1431 */ 1432 newval = task_pid_vnr(current); 1433 1434 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 1435 1436 if (unlikely(curval == -EFAULT)) 1437 goto uaddr_faulted; 1438 1439 /* 1440 * Detect deadlocks. In case of REQUEUE_PI this is a valid 1441 * situation and we return success to user space. 1442 */ 1443 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1444 ret = -EDEADLK; 1445 goto out_unlock_release_sem; 1446 } 1447 1448 /* 1449 * Surprise - we got the lock. Just return to userspace: 1450 */ 1451 if (unlikely(!curval)) 1452 goto out_unlock_release_sem; 1453 1454 uval = curval; 1455 1456 /* 1457 * Set the WAITERS flag, so the owner will know it has someone 1458 * to wake at next unlock 1459 */ 1460 newval = curval | FUTEX_WAITERS; 1461 1462 /* 1463 * There are two cases, where a futex might have no owner (the 1464 * owner TID is 0): OWNER_DIED. We take over the futex in this 1465 * case. We also do an unconditional take over, when the owner 1466 * of the futex died. 1467 * 1468 * This is safe as we are protected by the hash bucket lock ! 1469 */ 1470 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 1471 /* Keep the OWNER_DIED bit */ 1472 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); 1473 ownerdied = 0; 1474 lock_taken = 1; 1475 } 1476 1477 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1478 1479 if (unlikely(curval == -EFAULT)) 1480 goto uaddr_faulted; 1481 if (unlikely(curval != uval)) 1482 goto retry_locked; 1483 1484 /* 1485 * We took the lock due to owner died take over. 1486 */ 1487 if (unlikely(lock_taken)) 1488 goto out_unlock_release_sem; 1489 1490 /* 1491 * We dont have the lock. Look up the PI state (or create it if 1492 * we are the first waiter): 1493 */ 1494 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); 1495 1496 if (unlikely(ret)) { 1497 switch (ret) { 1498 1499 case -EAGAIN: 1500 /* 1501 * Task is exiting and we just wait for the 1502 * exit to complete. 1503 */ 1504 queue_unlock(&q, hb); 1505 futex_unlock_mm(fshared); 1506 cond_resched(); 1507 goto retry; 1508 1509 case -ESRCH: 1510 /* 1511 * No owner found for this futex. Check if the 1512 * OWNER_DIED bit is set to figure out whether 1513 * this is a robust futex or not. 1514 */ 1515 if (get_futex_value_locked(&curval, uaddr)) 1516 goto uaddr_faulted; 1517 1518 /* 1519 * We simply start over in case of a robust 1520 * futex. 
The code above will take the futex 1521 * and return happy. 1522 */ 1523 if (curval & FUTEX_OWNER_DIED) { 1524 ownerdied = 1; 1525 goto retry_locked; 1526 } 1527 default: 1528 goto out_unlock_release_sem; 1529 } 1530 } 1531 1532 /* 1533 * Only actually queue now that the atomic ops are done: 1534 */ 1535 queue_me(&q, hb); 1536 1537 /* 1538 * Now the futex is queued and we have checked the data, we 1539 * don't want to hold mmap_sem while we sleep. 1540 */ 1541 futex_unlock_mm(fshared); 1542 1543 WARN_ON(!q.pi_state); 1544 /* 1545 * Block on the PI mutex: 1546 */ 1547 if (!trylock) 1548 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); 1549 else { 1550 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 1551 /* Fixup the trylock return value: */ 1552 ret = ret ? 0 : -EWOULDBLOCK; 1553 } 1554 1555 futex_lock_mm(fshared); 1556 spin_lock(q.lock_ptr); 1557 1558 if (!ret) { 1559 /* 1560 * Got the lock. We might not be the anticipated owner 1561 * if we did a lock-steal - fix up the PI-state in 1562 * that case: 1563 */ 1564 if (q.pi_state->owner != curr) 1565 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1566 } else { 1567 /* 1568 * Catch the rare case, where the lock was released 1569 * when we were on the way back before we locked the 1570 * hash bucket. 1571 */ 1572 if (q.pi_state->owner == curr) { 1573 /* 1574 * Try to get the rt_mutex now. This might 1575 * fail as some other task acquired the 1576 * rt_mutex after we removed ourself from the 1577 * rt_mutex waiters list. 1578 */ 1579 if (rt_mutex_trylock(&q.pi_state->pi_mutex)) 1580 ret = 0; 1581 else { 1582 /* 1583 * pi_state is incorrect, some other 1584 * task did a lock steal and we 1585 * returned due to timeout or signal 1586 * without taking the rt_mutex. Too 1587 * late. We can access the 1588 * rt_mutex_owner without locking, as 1589 * the other task is now blocked on 1590 * the hash bucket lock. Fix the state 1591 * up. 1592 */ 1593 struct task_struct *owner; 1594 int res; 1595 1596 owner = rt_mutex_owner(&q.pi_state->pi_mutex); 1597 res = fixup_pi_state_owner(uaddr, &q, owner, 1598 fshared); 1599 1600 /* propagate -EFAULT, if the fixup failed */ 1601 if (res) 1602 ret = res; 1603 } 1604 } else { 1605 /* 1606 * Paranoia check. If we did not take the lock 1607 * in the trylock above, then we should not be 1608 * the owner of the rtmutex, neither the real 1609 * nor the pending one: 1610 */ 1611 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) 1612 printk(KERN_ERR "futex_lock_pi: ret = %d " 1613 "pi-mutex: %p pi-state %p\n", ret, 1614 q.pi_state->pi_mutex.owner, 1615 q.pi_state->owner); 1616 } 1617 } 1618 1619 /* Unqueue and drop the lock */ 1620 unqueue_me_pi(&q); 1621 futex_unlock_mm(fshared); 1622 1623 if (to) 1624 destroy_hrtimer_on_stack(&to->timer); 1625 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1626 1627 out_unlock_release_sem: 1628 queue_unlock(&q, hb); 1629 1630 out_release_sem: 1631 futex_unlock_mm(fshared); 1632 if (to) 1633 destroy_hrtimer_on_stack(&to->timer); 1634 return ret; 1635 1636 uaddr_faulted: 1637 /* 1638 * We have to r/w *(int __user *)uaddr, but we can't modify it 1639 * non-atomically. Therefore, if get_user below is not 1640 * enough, we need to handle the fault ourselves, while 1641 * still holding the mmap_sem. 1642 * 1643 * ... and hb->lock. 
:-) --ANK 1644 */ 1645 queue_unlock(&q, hb); 1646 1647 if (attempt++) { 1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1649 attempt); 1650 if (ret) 1651 goto out_release_sem; 1652 goto retry_unlocked; 1653 } 1654 1655 futex_unlock_mm(fshared); 1656 1657 ret = get_user(uval, uaddr); 1658 if (!ret && (uval != -EFAULT)) 1659 goto retry; 1660 1661 if (to) 1662 destroy_hrtimer_on_stack(&to->timer); 1663 return ret; 1664} 1665 1666/* 1667 * Userspace attempted a TID -> 0 atomic transition, and failed. 1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1669 * and do the rt-mutex unlock. 1670 */ 1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1672{ 1673 struct futex_hash_bucket *hb; 1674 struct futex_q *this, *next; 1675 u32 uval; 1676 struct plist_head *head; 1677 union futex_key key; 1678 int ret, attempt = 0; 1679 1680retry: 1681 if (get_user(uval, uaddr)) 1682 return -EFAULT; 1683 /* 1684 * We release only a lock we actually own: 1685 */ 1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1687 return -EPERM; 1688 /* 1689 * First take all the futex related locks: 1690 */ 1691 futex_lock_mm(fshared); 1692 1693 ret = get_futex_key(uaddr, fshared, &key); 1694 if (unlikely(ret != 0)) 1695 goto out; 1696 1697 hb = hash_futex(&key); 1698retry_unlocked: 1699 spin_lock(&hb->lock); 1700 1701 /* 1702 * To avoid races, try to do the TID -> 0 atomic transition 1703 * again. If it succeeds then we can return without waking 1704 * anyone else up: 1705 */ 1706 if (!(uval & FUTEX_OWNER_DIED)) 1707 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 1708 1709 1710 if (unlikely(uval == -EFAULT)) 1711 goto pi_faulted; 1712 /* 1713 * Rare case: we managed to release the lock atomically, 1714 * no need to wake anyone else up: 1715 */ 1716 if (unlikely(uval == task_pid_vnr(current))) 1717 goto out_unlock; 1718 1719 /* 1720 * Ok, other tasks may need to be woken up - check waiters 1721 * and do the wakeup if necessary: 1722 */ 1723 head = &hb->chain; 1724 1725 plist_for_each_entry_safe(this, next, head, list) { 1726 if (!match_futex (&this->key, &key)) 1727 continue; 1728 ret = wake_futex_pi(uaddr, uval, this); 1729 /* 1730 * The atomic access to the futex value 1731 * generated a pagefault, so retry the 1732 * user-access and the wakeup: 1733 */ 1734 if (ret == -EFAULT) 1735 goto pi_faulted; 1736 goto out_unlock; 1737 } 1738 /* 1739 * No waiters - kernel unlocks the futex: 1740 */ 1741 if (!(uval & FUTEX_OWNER_DIED)) { 1742 ret = unlock_futex_pi(uaddr, uval); 1743 if (ret == -EFAULT) 1744 goto pi_faulted; 1745 } 1746 1747out_unlock: 1748 spin_unlock(&hb->lock); 1749out: 1750 futex_unlock_mm(fshared); 1751 1752 return ret; 1753 1754pi_faulted: 1755 /* 1756 * We have to r/w *(int __user *)uaddr, but we can't modify it 1757 * non-atomically. Therefore, if get_user below is not 1758 * enough, we need to handle the fault ourselves, while 1759 * still holding the mmap_sem. 1760 * 1761 * ... and hb->lock. --ANK 1762 */ 1763 spin_unlock(&hb->lock); 1764 1765 if (attempt++) { 1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1767 attempt); 1768 if (ret) 1769 goto out; 1770 uval = 0; 1771 goto retry_unlocked; 1772 } 1773 1774 futex_unlock_mm(fshared); 1775 1776 ret = get_user(uval, uaddr); 1777 if (!ret && (uval != -EFAULT)) 1778 goto retry; 1779 1780 return ret; 1781} 1782 1783/* 1784 * Support for robust futexes: the kernel cleans up held futexes at 1785 * thread exit time. 
1786 * 1787 * Implementation: user-space maintains a per-thread list of locks it 1788 * is holding. Upon do_exit(), the kernel carefully walks this list, 1789 * and marks all locks that are owned by this thread with the 1790 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is 1791 * always manipulated with the lock held, so the list is private and 1792 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1793 * field, to allow the kernel to clean up if the thread dies after 1794 * acquiring the lock, but just before it could have added itself to 1795 * the list. There can only be one such pending lock. 1796 */ 1797 1798/** 1799 * sys_set_robust_list - set the robust-futex list head of a task 1800 * @head: pointer to the list-head 1801 * @len: length of the list-head, as userspace expects 1802 */ 1803asmlinkage long 1804sys_set_robust_list(struct robust_list_head __user *head, 1805 size_t len) 1806{ 1807 if (!futex_cmpxchg_enabled) 1808 return -ENOSYS; 1809 /* 1810 * The kernel knows only one size for now: 1811 */ 1812 if (unlikely(len != sizeof(*head))) 1813 return -EINVAL; 1814 1815 current->robust_list = head; 1816 1817 return 0; 1818} 1819 1820/** 1821 * sys_get_robust_list - get the robust-futex list head of a task 1822 * @pid: pid of the process [zero for current task] 1823 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 1824 * @len_ptr: pointer to a length field, the kernel fills in the header size 1825 */ 1826asmlinkage long 1827sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, 1828 size_t __user *len_ptr) 1829{ 1830 struct robust_list_head __user *head; 1831 unsigned long ret; 1832 1833 if (!futex_cmpxchg_enabled) 1834 return -ENOSYS; 1835 1836 if (!pid) 1837 head = current->robust_list; 1838 else { 1839 struct task_struct *p; 1840 1841 ret = -ESRCH; 1842 rcu_read_lock(); 1843 p = find_task_by_vpid(pid); 1844 if (!p) 1845 goto err_unlock; 1846 ret = -EPERM; 1847 if ((current->euid != p->euid) && (current->euid != p->uid) && 1848 !capable(CAP_SYS_PTRACE)) 1849 goto err_unlock; 1850 head = p->robust_list; 1851 rcu_read_unlock(); 1852 } 1853 1854 if (put_user(sizeof(*head), len_ptr)) 1855 return -EFAULT; 1856 return put_user(head, head_ptr); 1857 1858err_unlock: 1859 rcu_read_unlock(); 1860 1861 return ret; 1862} 1863 1864/* 1865 * Process a futex-list entry, check whether it's owned by the 1866 * dying task, and do notification if so: 1867 */ 1868int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 1869{ 1870 u32 uval, nval, mval; 1871 1872retry: 1873 if (get_user(uval, uaddr)) 1874 return -1; 1875 1876 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 1877 /* 1878 * Ok, this dying thread is truly holding a futex 1879 * of interest. Set the OWNER_DIED bit atomically 1880 * via cmpxchg, and if the value had FUTEX_WAITERS 1881 * set, wake up a waiter (if any). (We have to do a 1882 * futex_wake() even if OWNER_DIED is already set - 1883 * to handle the rare but possible case of recursive 1884 * thread-death.) The rest of the cleanup is done in 1885 * userspace. 1886 */ 1887 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 1888 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 1889 1890 if (nval == -EFAULT) 1891 return -1; 1892 1893 if (nval != uval) 1894 goto retry; 1895 1896 /* 1897 * Wake robust non-PI futexes here. 
The wakeup of 1898 * PI futexes happens in exit_pi_state(): 1899 */ 1900 if (!pi && (uval & FUTEX_WAITERS)) 1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1902 FUTEX_BITSET_MATCH_ANY); 1903 } 1904 return 0; 1905} 1906 1907/* 1908 * Fetch a robust-list pointer. Bit 0 signals PI futexes: 1909 */ 1910static inline int fetch_robust_entry(struct robust_list __user **entry, 1911 struct robust_list __user * __user *head, 1912 int *pi) 1913{ 1914 unsigned long uentry; 1915 1916 if (get_user(uentry, (unsigned long __user *)head)) 1917 return -EFAULT; 1918 1919 *entry = (void __user *)(uentry & ~1UL); 1920 *pi = uentry & 1; 1921 1922 return 0; 1923} 1924 1925/* 1926 * Walk curr->robust_list (very carefully, it's a userspace list!) 1927 * and mark any locks found there dead, and notify any waiters. 1928 * 1929 * We silently return on any sign of list-walking problem. 1930 */ 1931void exit_robust_list(struct task_struct *curr) 1932{ 1933 struct robust_list_head __user *head = curr->robust_list; 1934 struct robust_list __user *entry, *next_entry, *pending; 1935 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 1936 unsigned long futex_offset; 1937 int rc; 1938 1939 if (!futex_cmpxchg_enabled) 1940 return; 1941 1942 /* 1943 * Fetch the list head (which was registered earlier, via 1944 * sys_set_robust_list()): 1945 */ 1946 if (fetch_robust_entry(&entry, &head->list.next, &pi)) 1947 return; 1948 /* 1949 * Fetch the relative futex offset: 1950 */ 1951 if (get_user(futex_offset, &head->futex_offset)) 1952 return; 1953 /* 1954 * Fetch any possibly pending lock-add first, and handle it 1955 * if it exists: 1956 */ 1957 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 1958 return; 1959 1960 next_entry = NULL; /* avoid warning with gcc */ 1961 while (entry != &head->list) { 1962 /* 1963 * Fetch the next entry in the list before calling 1964 * handle_futex_death: 1965 */ 1966 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 1967 /* 1968 * A pending lock might already be on the list, so 1969 * don't process it twice: 1970 */ 1971 if (entry != pending) 1972 if (handle_futex_death((void __user *)entry + futex_offset, 1973 curr, pi)) 1974 return; 1975 if (rc) 1976 return; 1977 entry = next_entry; 1978 pi = next_pi; 1979 /* 1980 * Avoid excessively long or circular lists: 1981 */ 1982 if (!--limit) 1983 break; 1984 1985 cond_resched(); 1986 } 1987 1988 if (pending) 1989 handle_futex_death((void __user *)pending + futex_offset, 1990 curr, pip); 1991} 1992 1993long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1994 u32 __user *uaddr2, u32 val2, u32 val3) 1995{ 1996 int ret = -ENOSYS; 1997 int cmd = op & FUTEX_CMD_MASK; 1998 struct rw_semaphore *fshared = NULL; 1999 2000 if (!(op & FUTEX_PRIVATE_FLAG)) 2001 fshared = ¤t->mm->mmap_sem; 2002 2003 switch (cmd) { 2004 case FUTEX_WAIT: 2005 val3 = FUTEX_BITSET_MATCH_ANY; 2006 case FUTEX_WAIT_BITSET: 2007 ret = futex_wait(uaddr, fshared, val, timeout, val3); 2008 break; 2009 case FUTEX_WAKE: 2010 val3 = FUTEX_BITSET_MATCH_ANY; 2011 case FUTEX_WAKE_BITSET: 2012 ret = futex_wake(uaddr, fshared, val, val3); 2013 break; 2014 case FUTEX_REQUEUE: 2015 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2016 break; 2017 case FUTEX_CMP_REQUEUE: 2018 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2019 break; 2020 case FUTEX_WAKE_OP: 2021 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2022 break; 2023 case FUTEX_LOCK_PI: 2024 if (futex_cmpxchg_enabled) 2025 ret = futex_lock_pi(uaddr, fshared, 
val, timeout, 0); 2026 break; 2027 case FUTEX_UNLOCK_PI: 2028 if (futex_cmpxchg_enabled) 2029 ret = futex_unlock_pi(uaddr, fshared); 2030 break; 2031 case FUTEX_TRYLOCK_PI: 2032 if (futex_cmpxchg_enabled) 2033 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2034 break; 2035 default: 2036 ret = -ENOSYS; 2037 } 2038 return ret; 2039} 2040 2041 2042asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, 2043 struct timespec __user *utime, u32 __user *uaddr2, 2044 u32 val3) 2045{ 2046 struct timespec ts; 2047 ktime_t t, *tp = NULL; 2048 u32 val2 = 0; 2049 int cmd = op & FUTEX_CMD_MASK; 2050 2051 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2052 cmd == FUTEX_WAIT_BITSET)) { 2053 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2054 return -EFAULT; 2055 if (!timespec_valid(&ts)) 2056 return -EINVAL; 2057 2058 t = timespec_to_ktime(ts); 2059 if (cmd == FUTEX_WAIT) 2060 t = ktime_add_safe(ktime_get(), t); 2061 tp = &t; 2062 } 2063 /* 2064 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2065 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2066 */ 2067 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2068 cmd == FUTEX_WAKE_OP) 2069 val2 = (u32) (unsigned long) utime; 2070 2071 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2072} 2073 2074static int __init futex_init(void) 2075{ 2076 u32 curval; 2077 int i; 2078 2079 /* 2080 * This will fail and we want it. Some arch implementations do 2081 * runtime detection of the futex_atomic_cmpxchg_inatomic() 2082 * functionality. We want to know that before we call in any 2083 * of the complex code paths. Also we want to prevent 2084 * registration of robust lists in that case. NULL is 2085 * guaranteed to fault and we get -EFAULT on functional 2086 * implementation, the non functional ones will return 2087 * -ENOSYS. 2088 */ 2089 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2090 if (curval == -EFAULT) 2091 futex_cmpxchg_enabled = 1; 2092 2093 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2094 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2095 spin_lock_init(&futex_queues[i].lock); 2096 } 2097 2098 return 0; 2099} 2100__initcall(futex_init);
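
For context on how userspace drives this interface, below is a minimal sketch of the wait/wake protocol documented in the comment above futex_wait() (listing lines 1233-1234): the waiter passes the value it last read, and the waker stores the new value before calling FUTEX_WAKE. It is an illustration only, not part of the report; the futex() wrapper, futex_word and the thread bodies are invented for the example, and error handling is omitted.

/*
 * Minimal userspace sketch of the futex wait/wake protocol
 * (hypothetical names; build with: gcc -pthread).
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <pthread.h>

static uint32_t futex_word;		/* 0 = not ready, 1 = ready */

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
	/* glibc provides no wrapper; issue the raw syscall. */
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* Sleep only while the word still holds the expected value 0. */
	while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0)
		futex(&futex_word, FUTEX_WAIT, 0);
	printf("waiter: woken, futex_word=%u\n", futex_word);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);

	/* Waker: publish the new value first, then wake one waiter. */
	__atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
	futex(&futex_word, FUTEX_WAKE, 1);

	pthread_join(t, NULL);
	return 0;
}

If the waiter enters FUTEX_WAIT after the waker has already stored 1, the kernel compares *uaddr against the expected value under the hash-bucket lock (get_futex_value_locked() in the listing) and returns -EWOULDBLOCK instead of sleeping, which is why the re-check loop above can neither miss a wakeup nor be confused by a spurious one.
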