001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataInputStream;
023import java.io.DataOutputStream;
024import java.io.IOException;
025import java.nio.MappedByteBuffer;
026import java.util.HashMap;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.TreeMap;
030import java.util.concurrent.ScheduledFuture;
031import java.util.concurrent.ScheduledThreadPoolExecutor;
032import java.util.concurrent.TimeUnit;
033import java.util.concurrent.locks.Condition;
034import java.util.concurrent.locks.ReentrantLock;
035
036import org.apache.commons.lang.mutable.MutableBoolean;
037import org.apache.commons.logging.Log;
038import org.apache.commons.logging.LogFactory;
039import org.apache.hadoop.classification.InterfaceAudience;
040import org.apache.hadoop.conf.Configuration;
041import org.apache.hadoop.hdfs.DFSConfigKeys;
042import org.apache.hadoop.hdfs.ExtendedBlockId;
043import org.apache.hadoop.hdfs.net.DomainPeer;
044import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
045import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
046import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
047import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
048import org.apache.hadoop.hdfs.protocolPB.PBHelper;
049import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
050import org.apache.hadoop.io.IOUtils;
051import org.apache.hadoop.ipc.RetriableException;
052import org.apache.hadoop.net.unix.DomainSocket;
053import org.apache.hadoop.net.unix.DomainSocketWatcher;
054import org.apache.hadoop.security.token.SecretManager.InvalidToken;
055import org.apache.hadoop.util.StringUtils;
056import org.apache.hadoop.util.Time;
057import org.apache.hadoop.util.Waitable;
058
059import com.google.common.annotations.VisibleForTesting;
060import com.google.common.base.Preconditions;
061import com.google.common.util.concurrent.ThreadFactoryBuilder;
062
063/**
064 * The ShortCircuitCache tracks things which the client needs to access
065 * HDFS block files via short-circuit.
066 *
067 * These things include: memory-mapped regions, file descriptors, and shared
068 * memory areas for communicating with the DataNode.
069 */
070@InterfaceAudience.Private
071public class ShortCircuitCache implements Closeable {
072  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
073
074  /**
075   * Expiry thread which makes sure that the file descriptors get closed
076   * after a while.
077   */
078  private class CacheCleaner implements Runnable, Closeable {
079    private ScheduledFuture<?> future;
080
081    /**
082     * Run the CacheCleaner thread.
083     *
084     * Whenever a thread requests a ShortCircuitReplica object, we will make
085     * sure it gets one.  That ShortCircuitReplica object can then be re-used
086     * when another thread requests a ShortCircuitReplica object for the same
087     * block.  So in that sense, there is no maximum size to the cache.
088     *
089     * However, when a ShortCircuitReplica object is unreferenced by the
090     * thread(s) that are using it, it becomes evictable.  There are two
091     * separate eviction lists-- one for mmaped objects, and another for
092     * non-mmaped objects.  We do this in order to avoid having the regular
093     * files kick the mmaped files out of the cache too quickly.  Reusing
094     * an already-existing mmap gives a huge performance boost, since the
095     * page table entries don't have to be re-populated.  Both the mmap
096     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
097     */
098    @Override
099    public void run() {
100      ShortCircuitCache.this.lock.lock();
101      try {
102        if (ShortCircuitCache.this.closed) return;
103        long curMs = Time.monotonicNow();
104
105        if (LOG.isDebugEnabled()) {
106          LOG.debug(this + ": cache cleaner running at " + curMs);
107        }
108
109        int numDemoted = demoteOldEvictableMmaped(curMs);
110        int numPurged = 0;
111        Long evictionTimeNs = Long.valueOf(0);
112        while (true) {
113          Entry<Long, ShortCircuitReplica> entry = 
114              evictable.ceilingEntry(evictionTimeNs);
115          if (entry == null) break;
116          evictionTimeNs = entry.getKey();
117          long evictionTimeMs = 
118              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
119          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
120          ShortCircuitReplica replica = entry.getValue();
121          if (LOG.isTraceEnabled()) {
122            LOG.trace("CacheCleaner: purging " + replica + ": " + 
123                  StringUtils.getStackTrace(Thread.currentThread()));
124          }
125          purge(replica);
126          numPurged++;
127        }
128
129        if (LOG.isDebugEnabled()) {
130          LOG.debug(this + ": finishing cache cleaner run started at " +
131            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
132            "purged " + numPurged + " replicas.");
133        }
134      } finally {
135        ShortCircuitCache.this.lock.unlock();
136      }
137    }
138
139    @Override
140    public void close() throws IOException {
141      if (future != null) {
142        future.cancel(false);
143      }
144    }
145
146    public void setFuture(ScheduledFuture<?> future) {
147      this.future = future;
148    }
149
150    /**
151     * Get the rate at which this cleaner thread should be scheduled.
152     *
153     * We do this by taking the minimum expiration time and dividing by 4.
154     *
155     * @return the rate in milliseconds at which this thread should be
156     *         scheduled.
157     */
158    public long getRateInMs() {
159      long minLifespanMs =
160          Math.min(maxNonMmappedEvictableLifespanMs,
161              maxEvictableMmapedLifespanMs);
162      long sampleTimeMs = minLifespanMs / 4;
163      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
164    }
165  }
166
167  /**
168   * A task which asks the DataNode to release a short-circuit shared memory
169   * slot.  If successful, this will tell the DataNode to stop monitoring
170   * changes to the mlock status of the replica associated with the slot.
171   * It will also allow us (the client) to re-use this slot for another
172   * replica.  If we can't communicate with the DataNode for some reason,
173   * we tear down the shared memory segment to avoid being in an inconsistent
174   * state.
175   */
176  private class SlotReleaser implements Runnable {
177    /**
178     * The slot that we need to release.
179     */
180    private final Slot slot;
181
182    SlotReleaser(Slot slot) {
183      this.slot = slot;
184    }
185
186    @Override
187    public void run() {
188      if (LOG.isTraceEnabled()) {
189        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
190      }
191      final DfsClientShm shm = (DfsClientShm)slot.getShm();
192      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
193      DomainSocket sock = null;
194      DataOutputStream out = null;
195      final String path = shmSock.getPath();
196      boolean success = false;
197      try {
198        sock = DomainSocket.connect(path);
199        out = new DataOutputStream(
200            new BufferedOutputStream(sock.getOutputStream()));
201        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
202        DataInputStream in = new DataInputStream(sock.getInputStream());
203        ReleaseShortCircuitAccessResponseProto resp =
204            ReleaseShortCircuitAccessResponseProto.parseFrom(
205                PBHelper.vintPrefixed(in));
206        if (resp.getStatus() != Status.SUCCESS) {
207          String error = resp.hasError() ? resp.getError() : "(unknown)";
208          throw new IOException(resp.getStatus().toString() + ": " + error);
209        }
210        if (LOG.isTraceEnabled()) {
211          LOG.trace(ShortCircuitCache.this + ": released " + slot);
212        }
213        success = true;
214      } catch (IOException e) {
215        LOG.error(ShortCircuitCache.this + ": failed to release " +
216            "short-circuit shared memory slot " + slot + " by sending " +
217            "ReleaseShortCircuitAccessRequestProto to " + path +
218            ".  Closing shared memory segment.", e);
219      } finally {
220        if (success) {
221          shmManager.freeSlot(slot);
222        } else {
223          shm.getEndpointShmManager().shutdown(shm);
224        }
225        IOUtils.cleanup(LOG, sock, out);
226      }
227    }
228  }
229
  /**
   * Callback used by {@link #fetchOrCreate} to load a replica which is not
   * in the cache yet.
   */
  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * Implementations should return a non-null object.  The cache itself
     * treats a null return value, or a RuntimeException thrown from this
     * method, as a failed load and substitutes an empty
     * ShortCircuitReplicaInfo.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }
240
  /**
   * Lock protecting the cache.  It is reentrant: methods such as ref() and
   * unref() acquire it themselves and may be called while it is already
   * held.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
  = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * Single-threaded daemon executor named "ShortCircuitCache_SlotReleaser".
   * NOTE(review): judging by its name it runs SlotReleaser tasks; the place
   * where tasks are submitted is not visible in this part of the file.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.  The values are Waitables, so that threads asking for a
   * replica which is still being loaded can wait for the load to finish.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total number of evictable elements, counting both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements which have been evictable for longer than this are
   * unmapped and moved into the regular eviction map.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.  This is
   * null when shared memory support was disabled or could not be set up in
   * the constructor.
   */
  private final DfsClientShmManager shmManager;
340
341  /**
342   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
343   */
344  public static ShortCircuitCache fromConf(Configuration conf) {
345    return new ShortCircuitCache(
346        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
347            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
348        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
349            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
350        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
351            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
352        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
353            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
354        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
355            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
356        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
357            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
358        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
359            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
360  }
361
362  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
363      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
364      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
365    Preconditions.checkArgument(maxTotalSize >= 0);
366    this.maxTotalSize = maxTotalSize;
367    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
368    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
369    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
370    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
371    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
372    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
373    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
374    this.staleThresholdMs = staleThresholdMs;
375    DfsClientShmManager shmManager = null;
376    if ((shmInterruptCheckMs > 0) &&
377        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
378      try {
379        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
380      } catch (IOException e) {
381        LOG.error("failed to create ShortCircuitShmManager", e);
382      }
383    }
384    this.shmManager = shmManager;
385  }
386
387  public long getStaleThresholdMs() {
388    return staleThresholdMs;
389  }
390
391  /**
392   * Increment the reference count of a replica, and remove it from any free
393   * list it may be in.
394   *
395   * You must hold the cache lock while calling this function.
396   *
397   * @param replica      The replica we're removing.
398   */
399  private void ref(ShortCircuitReplica replica) {
400    lock.lock();
401    try {
402      Preconditions.checkArgument(replica.refCount > 0,
403          "can't ref " + replica + " because its refCount reached " +
404          replica.refCount);
405      Long evictableTimeNs = replica.getEvictableTimeNs();
406      replica.refCount++;
407      if (evictableTimeNs != null) {
408        String removedFrom = removeEvictable(replica);
409        if (LOG.isTraceEnabled()) {
410          LOG.trace(this + ": " + removedFrom +
411              " no longer contains " + replica + ".  refCount " +
412              (replica.refCount - 1) + " -> " + replica.refCount +
413              StringUtils.getStackTrace(Thread.currentThread()));
414
415        }
416      } else if (LOG.isTraceEnabled()) {
417        LOG.trace(this + ": replica  refCount " +
418            (replica.refCount - 1) + " -> " + replica.refCount +
419            StringUtils.getStackTrace(Thread.currentThread()));
420      }
421    } finally {
422      lock.unlock();
423    }
424  }
425
426  /**
427   * Unreference a replica.
428   *
429   * You must hold the cache lock while calling this function.
430   *
431   * @param replica   The replica being unreferenced.
432   */
433  void unref(ShortCircuitReplica replica) {
434    lock.lock();
435    try {
436      // If the replica is stale or unusable, but we haven't purged it yet,
437      // let's do that.  It would be a shame to evict a non-stale replica so
438      // that we could put a stale or unusable one into the cache.
439      if (!replica.purged) {
440        String purgeReason = null;
441        if (!replica.getDataStream().getChannel().isOpen()) {
442          purgeReason = "purging replica because its data channel is closed.";
443        } else if (!replica.getMetaStream().getChannel().isOpen()) {
444          purgeReason = "purging replica because its meta channel is closed.";
445        } else if (replica.isStale()) {
446          purgeReason = "purging replica because it is stale.";
447        }
448        if (purgeReason != null) {
449          LOG.debug(this + ": " + purgeReason);
450          purge(replica);
451        }
452      }
453      String addedString = "";
454      boolean shouldTrimEvictionMaps = false;
455      int newRefCount = --replica.refCount;
456      if (newRefCount == 0) {
457        // Close replica, since there are no remaining references to it.
458        Preconditions.checkArgument(replica.purged,
459            "Replica " + replica + " reached a refCount of 0 without " +
460            "being purged");
461        replica.close();
462      } else if (newRefCount == 1) {
463        Preconditions.checkState(null == replica.getEvictableTimeNs(),
464            "Replica " + replica + " had a refCount higher than 1, " +
465              "but was still evictable (evictableTimeNs = " +
466                replica.getEvictableTimeNs() + ")");
467        if (!replica.purged) {
468          // Add the replica to the end of an eviction list.
469          // Eviction lists are sorted by time.
470          if (replica.hasMmap()) {
471            insertEvictable(System.nanoTime(), replica, evictableMmapped);
472            addedString = "added to evictableMmapped, ";
473          } else {
474            insertEvictable(System.nanoTime(), replica, evictable);
475            addedString = "added to evictable, ";
476          }
477          shouldTrimEvictionMaps = true;
478        }
479      } else {
480        Preconditions.checkArgument(replica.refCount >= 0,
481            "replica's refCount went negative (refCount = " +
482            replica.refCount + " for " + replica + ")");
483      }
484      if (LOG.isTraceEnabled()) {
485        LOG.trace(this + ": unref replica " + replica +
486            ": " + addedString + " refCount " +
487            (newRefCount + 1) + " -> " + newRefCount +
488            StringUtils.getStackTrace(Thread.currentThread()));
489      }
490      if (shouldTrimEvictionMaps) {
491        trimEvictionMaps();
492      }
493    } finally {
494      lock.unlock();
495    }
496  }
497
498  /**
499   * Demote old evictable mmaps into the regular eviction map.
500   *
501   * You must hold the cache lock while calling this function.
502   *
503   * @param now   Current time in monotonic milliseconds.
504   * @return      Number of replicas demoted.
505   */
506  private int demoteOldEvictableMmaped(long now) {
507    int numDemoted = 0;
508    boolean needMoreSpace = false;
509    Long evictionTimeNs = Long.valueOf(0);
510
511    while (true) {
512      Entry<Long, ShortCircuitReplica> entry = 
513          evictableMmapped.ceilingEntry(evictionTimeNs);
514      if (entry == null) break;
515      evictionTimeNs = entry.getKey();
516      long evictionTimeMs = 
517          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
518      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
519        if (evictableMmapped.size() < maxEvictableMmapedSize) {
520          break;
521        }
522        needMoreSpace = true;
523      }
524      ShortCircuitReplica replica = entry.getValue();
525      if (LOG.isTraceEnabled()) {
526        String rationale = needMoreSpace ? "because we need more space" : 
527            "because it's too old";
528        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
529            rationale + ": " +
530            StringUtils.getStackTrace(Thread.currentThread()));
531      }
532      removeEvictable(replica, evictableMmapped);
533      munmap(replica);
534      insertEvictable(evictionTimeNs, replica, evictable);
535      numDemoted++;
536    }
537    return numDemoted;
538  }
539
540  /**
541   * Trim the eviction lists.
542   */
543  private void trimEvictionMaps() {
544    long now = Time.monotonicNow();
545    demoteOldEvictableMmaped(now);
546
547    while (true) {
548      long evictableSize = evictable.size();
549      long evictableMmappedSize = evictableMmapped.size();
550      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
551        return;
552      }
553      ShortCircuitReplica replica;
554      if (evictableSize == 0) {
555       replica = evictableMmapped.firstEntry().getValue();
556      } else {
557       replica = evictable.firstEntry().getValue();
558      }
559      if (LOG.isTraceEnabled()) {
560        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
561          StringUtils.getStackTrace(Thread.currentThread()));
562      }
563      purge(replica);
564    }
565  }
566
567  /**
568   * Munmap a replica, updating outstandingMmapCount.
569   *
570   * @param replica  The replica to munmap.
571   */
572  private void munmap(ShortCircuitReplica replica) {
573    replica.munmap();
574    outstandingMmapCount--;
575  }
576
577  /**
578   * Remove a replica from an evictable map.
579   *
580   * @param replica   The replica to remove.
581   * @return          The map it was removed from.
582   */
583  private String removeEvictable(ShortCircuitReplica replica) {
584    if (replica.hasMmap()) {
585      removeEvictable(replica, evictableMmapped);
586      return "evictableMmapped";
587    } else {
588      removeEvictable(replica, evictable);
589      return "evictable";
590    }
591  }
592
593  /**
594   * Remove a replica from an evictable map.
595   *
596   * @param replica   The replica to remove.
597   * @param map       The map to remove it from.
598   */
599  private void removeEvictable(ShortCircuitReplica replica,
600      TreeMap<Long, ShortCircuitReplica> map) {
601    Long evictableTimeNs = replica.getEvictableTimeNs();
602    Preconditions.checkNotNull(evictableTimeNs);
603    ShortCircuitReplica removed = map.remove(evictableTimeNs);
604    Preconditions.checkState(removed == replica,
605        "failed to make " + replica + " unevictable");
606    replica.setEvictableTimeNs(null);
607  }
608
609  /**
610   * Insert a replica into an evictable map.
611   *
612   * If an element already exists with this eviction time, we add a nanosecond
613   * to it until we find an unused key.
614   *
615   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
616   * @param replica          The replica to insert.
617   * @param map              The map to insert it into.
618   */
619  private void insertEvictable(Long evictionTimeNs,
620      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
621    while (map.containsKey(evictionTimeNs)) {
622      evictionTimeNs++;
623    }
624    Preconditions.checkState(null == replica.getEvictableTimeNs());
625    replica.setEvictableTimeNs(evictionTimeNs);
626    map.put(evictionTimeNs, replica);
627  }
628
629  /**
630   * Purge a replica from the cache.
631   *
632   * This doesn't necessarily close the replica, since there may be
633   * outstanding references to it.  However, it does mean the cache won't
634   * hand it out to anyone after this.
635   *
636   * You must hold the cache lock while calling this function.
637   *
638   * @param replica   The replica being removed.
639   */
640  private void purge(ShortCircuitReplica replica) {
641    boolean removedFromInfoMap = false;
642    String evictionMapName = null;
643    Preconditions.checkArgument(!replica.purged);
644    replica.purged = true;
645    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
646    if (val != null) {
647      ShortCircuitReplicaInfo info = val.getVal();
648      if ((info != null) && (info.getReplica() == replica)) {
649        replicaInfoMap.remove(replica.key);
650        removedFromInfoMap = true;
651      }
652    }
653    Long evictableTimeNs = replica.getEvictableTimeNs();
654    if (evictableTimeNs != null) {
655      evictionMapName = removeEvictable(replica);
656    }
657    if (LOG.isTraceEnabled()) {
658      StringBuilder builder = new StringBuilder();
659      builder.append(this).append(": ").append(": purged ").
660          append(replica).append(" from the cache.");
661      if (removedFromInfoMap) {
662        builder.append("  Removed from the replicaInfoMap.");
663      }
664      if (evictionMapName != null) {
665        builder.append("  Removed from ").append(evictionMapName);
666      }
667      LOG.trace(builder.toString());
668    }
669    unref(replica);
670  }
671
672  /**
673   * Fetch or create a replica.
674   *
675   * You must hold the cache lock while calling this function.
676   *
677   * @param key          Key to use for lookup.
678   * @param creator      Replica creator callback.  Will be called without
679   *                     the cache lock being held.
680   *
681   * @return             Null if no replica could be found or created.
682   *                     The replica, otherwise.
683   */
684  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
685      ShortCircuitReplicaCreator creator) {
686    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
687    lock.lock();
688    try {
689      ShortCircuitReplicaInfo info = null;
690      do {
691        if (closed) {
692          if (LOG.isTraceEnabled()) {
693            LOG.trace(this + ": can't fetchOrCreate " + key +
694                " because the cache is closed.");
695          }
696          return null;
697        }
698        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
699        if (waitable != null) {
700          try {
701            info = fetch(key, waitable);
702          } catch (RetriableException e) {
703            if (LOG.isDebugEnabled()) {
704              LOG.debug(this + ": retrying " + e.getMessage());
705            }
706            continue;
707          }
708        }
709      } while (false);
710      if (info != null) return info;
711      // We need to load the replica ourselves.
712      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
713      replicaInfoMap.put(key, newWaitable);
714    } finally {
715      lock.unlock();
716    }
717    return create(key, creator, newWaitable);
718  }
719
720  /**
721   * Fetch an existing ReplicaInfo object.
722   *
723   * @param key       The key that we're using.
724   * @param waitable  The waitable object to wait on.
725   * @return          The existing ReplicaInfo object, or null if there is
726   *                  none.
727   *
728   * @throws RetriableException   If the caller needs to retry.
729   */
730  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
731      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
732    // Another thread is already in the process of loading this
733    // ShortCircuitReplica.  So we simply wait for it to complete.
734    ShortCircuitReplicaInfo info;
735    try {
736      if (LOG.isTraceEnabled()) {
737        LOG.trace(this + ": found waitable for " + key);
738      }
739      info = waitable.await();
740    } catch (InterruptedException e) {
741      LOG.info(this + ": interrupted while waiting for " + key);
742      Thread.currentThread().interrupt();
743      throw new RetriableException("interrupted");
744    }
745    if (info.getInvalidTokenException() != null) {
746      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
747            "exception.", info.getInvalidTokenException());
748      return info;
749    }
750    ShortCircuitReplica replica = info.getReplica();
751    if (replica == null) {
752      LOG.warn(this + ": failed to get " + key);
753      return info;
754    }
755    if (replica.purged) {
756      // Ignore replicas that have already been purged from the cache.
757      throw new RetriableException("Ignoring purged replica " +
758          replica + ".  Retrying.");
759    }
760    // Check if the replica is stale before using it.
761    // If it is, purge it and retry.
762    if (replica.isStale()) {
763      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
764          "this replica from the replicaInfoMap and retrying.");
765      // Remove the cache's reference to the replica.  This may or may not
766      // trigger a close.
767      purge(replica);
768      throw new RetriableException("ignoring stale replica " + replica);
769    }
770    ref(replica);
771    return info;
772  }
773
774  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
775      ShortCircuitReplicaCreator creator,
776      Waitable<ShortCircuitReplicaInfo> newWaitable) {
777    // Handle loading a new replica.
778    ShortCircuitReplicaInfo info = null;
779    try {
780      if (LOG.isTraceEnabled()) {
781        LOG.trace(this + ": loading " + key);
782      }
783      info = creator.createShortCircuitReplicaInfo();
784    } catch (RuntimeException e) {
785      LOG.warn(this + ": failed to load " + key, e);
786    }
787    if (info == null) info = new ShortCircuitReplicaInfo();
788    lock.lock();
789    try {
790      if (info.getReplica() != null) {
791        // On success, make sure the cache cleaner thread is running.
792        if (LOG.isTraceEnabled()) {
793          LOG.trace(this + ": successfully loaded " + info.getReplica());
794        }
795        startCacheCleanerThreadIfNeeded();
796        // Note: new ShortCircuitReplicas start with a refCount of 2,
797        // indicating that both this cache and whoever requested the 
798        // creation of the replica hold a reference.  So we don't need
799        // to increment the reference count here.
800      } else {
801        // On failure, remove the waitable from the replicaInfoMap.
802        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
803        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
804        if (info.getInvalidTokenException() != null) {
805          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
806              "exception.", info.getInvalidTokenException());
807        } else {
808          LOG.warn(this + ": failed to load " + key);
809        }
810      }
811      newWaitable.provide(info);
812    } finally {
813      lock.unlock();
814    }
815    return info;
816  }
817
818  private void startCacheCleanerThreadIfNeeded() {
819    if (cacheCleaner == null) {
820      cacheCleaner = new CacheCleaner();
821      long rateMs = cacheCleaner.getRateInMs();
822      ScheduledFuture<?> future =
823          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
824              TimeUnit.MILLISECONDS);
825      cacheCleaner.setFuture(future);
826      if (LOG.isDebugEnabled()) {
827        LOG.debug(this + ": starting cache cleaner thread which will run " +
828          "every " + rateMs + " ms");
829      }
830    }
831  }
832
833  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
834      boolean anchored) {
835    Condition newCond;
836    lock.lock();
837    try {
838      while (replica.mmapData != null) {
839        if (replica.mmapData instanceof MappedByteBuffer) {
840          ref(replica);
841          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
842          return new ClientMmap(replica, mmap, anchored);
843        } else if (replica.mmapData instanceof Long) {
844          long lastAttemptTimeMs = (Long)replica.mmapData;
845          long delta = Time.monotonicNow() - lastAttemptTimeMs;
846          if (delta < mmapRetryTimeoutMs) {
847            if (LOG.isTraceEnabled()) {
848              LOG.trace(this + ": can't create client mmap for " +
849                  replica + " because we failed to " +
850                  "create one just " + delta + "ms ago.");
851            }
852            return null;
853          }
854          if (LOG.isTraceEnabled()) {
855            LOG.trace(this + ": retrying client mmap for " + replica +
856                ", " + delta + " ms after the previous failure.");
857          }
858        } else if (replica.mmapData instanceof Condition) {
859          Condition cond = (Condition)replica.mmapData;
860          cond.awaitUninterruptibly();
861        } else {
862          Preconditions.checkState(false, "invalid mmapData type " +
863              replica.mmapData.getClass().getName());
864        }
865      }
866      newCond = lock.newCondition();
867      replica.mmapData = newCond;
868    } finally {
869      lock.unlock();
870    }
871    MappedByteBuffer map = replica.loadMmapInternal();
872    lock.lock();
873    try {
874      if (map == null) {
875        replica.mmapData = Long.valueOf(Time.monotonicNow());
876        newCond.signalAll();
877        return null;
878      } else {
879        outstandingMmapCount++;
880        replica.mmapData = map;
881        ref(replica);
882        newCond.signalAll();
883        return new ClientMmap(replica, map, anchored);
884      }
885    } finally {
886      lock.unlock();
887    }
888  }
889
890  /**
891   * Close the cache and free all associated resources.
892   */
893  @Override
894  public void close() {
895    try {
896      lock.lock();
897      if (closed) return;
898      closed = true;
899      LOG.info(this + ": closing");
900      maxNonMmappedEvictableLifespanMs = 0;
901      maxEvictableMmapedSize = 0;
902      // Close and join cacheCleaner thread.
903      IOUtils.cleanup(LOG, cacheCleaner);
904      // Purge all replicas.
905      while (true) {
906        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
907        if (entry == null) break;
908        purge(entry.getValue());
909      }
910      while (true) {
911        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
912        if (entry == null) break;
913        purge(entry.getValue());
914      }
915    } finally {
916      lock.unlock();
917    }
918    IOUtils.cleanup(LOG, shmManager);
919  }
920
  /**
   * Callback through which {@link #accept(CacheVisitor)} exposes the cache's
   * internal state.
   */
  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    /**
     * Inspect the cache state.  accept() calls this while holding the cache
     * lock.
     *
     * @param numOutstandingMmaps  The cache's outstandingMmapCount.
     * @param replicas             Block id to replica, for every entry whose
     *                               load completed with a replica.  A fresh
     *                               map built by accept().
     * @param failedLoads          Block id to InvalidToken exception, for
     *                               every entry whose load completed without
     *                               a replica.  The value may be null.  A
     *                               fresh map built by accept().
     * @param evictable            The cache's own evictable map, not a copy.
     *                               (Presumably keyed by eviction time --
     *                               TODO confirm.)
     * @param evictableMmapped     The cache's own evictableMmapped map, not
     *                               a copy.
     */
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }
929
930  @VisibleForTesting // ONLY for testing
931  public void accept(CacheVisitor visitor) {
932    lock.lock();
933    try {
934      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
935          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
936      Map<ExtendedBlockId, InvalidToken> failedLoads =
937          new HashMap<ExtendedBlockId, InvalidToken>();
938      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
939            replicaInfoMap.entrySet()) {
940        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
941        if (waitable.hasVal()) {
942          if (waitable.getVal().getReplica() != null) {
943            replicas.put(entry.getKey(), waitable.getVal().getReplica());
944          } else {
945            // The exception may be null here, indicating a failed load that
946            // isn't the result of an invalid block token.
947            failedLoads.put(entry.getKey(),
948                waitable.getVal().getInvalidTokenException());
949          }
950        }
951      }
952      if (LOG.isDebugEnabled()) {
953        StringBuilder builder = new StringBuilder();
954        builder.append("visiting ").append(visitor.getClass().getName()).
955            append("with outstandingMmapCount=").append(outstandingMmapCount).
956            append(", replicas=");
957        String prefix = "";
958        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
959          builder.append(prefix).append(entry.getValue());
960          prefix = ",";
961        }
962        prefix = "";
963        builder.append(", failedLoads=");
964        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
965          builder.append(prefix).append(entry.getValue());
966          prefix = ",";
967        }
968        prefix = "";
969        builder.append(", evictable=");
970        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
971          builder.append(prefix).append(entry.getKey()).
972              append(":").append(entry.getValue());
973          prefix = ",";
974        }
975        prefix = "";
976        builder.append(", evictableMmapped=");
977        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
978          builder.append(prefix).append(entry.getKey()).
979              append(":").append(entry.getValue());
980          prefix = ",";
981        }
982        LOG.debug(builder.toString());
983      }
984      visitor.visit(outstandingMmapCount, replicas, failedLoads,
985            evictable, evictableMmapped);
986    } finally {
987      lock.unlock();
988    }
989  }
990
991  @Override
992  public String toString() {
993    return "ShortCircuitCache(0x" +
994        Integer.toHexString(System.identityHashCode(this)) + ")";
995  }
996
997  /**
998   * Allocate a new shared memory slot.
999   *
1000   * @param datanode       The datanode to allocate a shm slot with.
1001   * @param peer           A peer connected to the datanode.
1002   * @param usedPeer       Will be set to true if we use up the provided peer.
1003   * @param blockId        The block id and block pool id of the block we're 
1004   *                         allocating this slot for.
1005   * @param clientName     The name of the DFSClient allocating the shared
1006   *                         memory.
1007   * @return               Null if short-circuit shared memory is disabled;
1008   *                         a short-circuit memory slot otherwise.
1009   * @throws IOException   An exception if there was an error talking to 
1010   *                         the datanode.
1011   */
1012  public Slot allocShmSlot(DatanodeInfo datanode,
1013        DomainPeer peer, MutableBoolean usedPeer,
1014        ExtendedBlockId blockId, String clientName) throws IOException {
1015    if (shmManager != null) {
1016      return shmManager.allocSlot(datanode, peer, usedPeer,
1017          blockId, clientName);
1018    } else {
1019      return null;
1020    }
1021  }
1022
  /**
   * Free a slot immediately.
   *
   * ONLY use this if the DataNode is not yet aware of the slot.
   *
   * The slot is marked invalid before it is handed back to the shared
   * memory manager.  The Preconditions.checkState check fails if there is
   * no shared memory manager.
   *
   * @param slot           The slot to free.
   */
  public void freeSlot(Slot slot) {
    Preconditions.checkState(shmManager != null);
    // Invalidate first, then return the slot to the manager.
    slot.makeInvalid();
    shmManager.freeSlot(slot);
  }
1035  
  /**
   * Schedule a shared memory slot to be released.
   *
   * A new SlotReleaser for the slot is handed to releaserExecutor; this
   * method itself does not release anything.  The Preconditions.checkState
   * check fails if there is no shared memory manager.
   *
   * @param slot           The slot to release.
   */
  public void scheduleSlotReleaser(Slot slot) {
    Preconditions.checkState(shmManager != null);
    releaserExecutor.execute(new SlotReleaser(slot));
  }
1045
  /**
   * @return  The shared memory manager used by this cache.  Other methods
   *            in this class treat a null value as "no shared memory
   *            manager", so the result may be null.
   */
  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
1050}