001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_STORAGE_POLICY_ENABLED_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_STORAGE_POLICY_ENABLED_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
092import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
093import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
094import static org.apache.hadoop.util.Time.now;
095
096import java.io.BufferedWriter;
097import java.io.ByteArrayInputStream;
098import java.io.DataInput;
099import java.io.DataInputStream;
100import java.io.DataOutputStream;
101import java.io.File;
102import java.io.FileNotFoundException;
103import java.io.FileOutputStream;
104import java.io.IOException;
105import java.io.OutputStreamWriter;
106import java.io.PrintWriter;
107import java.io.StringWriter;
108import java.lang.management.ManagementFactory;
109import java.net.InetAddress;
110import java.net.URI;
111import java.security.GeneralSecurityException;
112import java.security.NoSuchAlgorithmException;
113import java.util.ArrayList;
114import java.util.Arrays;
115import java.util.Collection;
116import java.util.Collections;
117import java.util.Date;
118import java.util.EnumSet;
119import java.util.HashMap;
120import java.util.HashSet;
121import java.util.Iterator;
122import java.util.LinkedHashSet;
123import java.util.List;
124import java.util.Map;
125import java.util.Set;
126import java.util.UUID;
127import java.util.concurrent.TimeUnit;
128import java.util.concurrent.locks.Condition;
129import java.util.concurrent.locks.ReentrantLock;
130import java.util.concurrent.locks.ReentrantReadWriteLock;
131
132import javax.management.NotCompliantMBeanException;
133import javax.management.ObjectName;
134import javax.management.StandardMBean;
135
136import org.apache.commons.logging.Log;
137import org.apache.commons.logging.LogFactory;
138import org.apache.commons.logging.impl.Log4JLogger;
139import org.apache.hadoop.HadoopIllegalArgumentException;
140import org.apache.hadoop.classification.InterfaceAudience;
141import org.apache.hadoop.conf.Configuration;
142import org.apache.hadoop.crypto.CipherSuite;
143import org.apache.hadoop.crypto.CryptoProtocolVersion;
144import org.apache.hadoop.crypto.key.KeyProvider;
145import org.apache.hadoop.crypto.CryptoCodec;
146import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
147import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
148import org.apache.hadoop.fs.CacheFlag;
149import org.apache.hadoop.fs.ContentSummary;
150import org.apache.hadoop.fs.CreateFlag;
151import org.apache.hadoop.fs.DirectoryListingStartAfterNotFoundException;
152import org.apache.hadoop.fs.FileAlreadyExistsException;
153import org.apache.hadoop.fs.FileEncryptionInfo;
154import org.apache.hadoop.fs.FileStatus;
155import org.apache.hadoop.fs.FileSystem;
156import org.apache.hadoop.fs.FsServerDefaults;
157import org.apache.hadoop.fs.InvalidPathException;
158import org.apache.hadoop.fs.Options;
159import org.apache.hadoop.fs.Options.Rename;
160import org.apache.hadoop.fs.ParentNotDirectoryException;
161import org.apache.hadoop.fs.Path;
162import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
163import org.apache.hadoop.fs.UnresolvedLinkException;
164import org.apache.hadoop.fs.XAttr;
165import org.apache.hadoop.fs.XAttrSetFlag;
166import org.apache.hadoop.fs.permission.AclEntry;
167import org.apache.hadoop.fs.permission.AclStatus;
168import org.apache.hadoop.fs.permission.FsAction;
169import org.apache.hadoop.fs.permission.FsPermission;
170import org.apache.hadoop.fs.permission.PermissionStatus;
171import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
172import org.apache.hadoop.ha.ServiceFailedException;
173import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
174import org.apache.hadoop.hdfs.DFSConfigKeys;
175import org.apache.hadoop.hdfs.DFSUtil;
176import org.apache.hadoop.hdfs.HAUtil;
177import org.apache.hadoop.hdfs.HdfsConfiguration;
178import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
179import org.apache.hadoop.hdfs.XAttrHelper;
180import org.apache.hadoop.hdfs.protocol.AclException;
181import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
182import org.apache.hadoop.hdfs.protocol.Block;
183import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
184import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
185import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
186import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
187import org.apache.hadoop.hdfs.protocol.ClientProtocol;
188import org.apache.hadoop.hdfs.protocol.DatanodeID;
189import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
190import org.apache.hadoop.hdfs.protocol.DirectoryListing;
191import org.apache.hadoop.hdfs.protocol.EncryptionZone;
192import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
193import org.apache.hadoop.hdfs.protocol.HdfsConstants;
194import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
195import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
196import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
197import org.apache.hadoop.hdfs.protocol.LocatedBlock;
198import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
199import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
200import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
201import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
202import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
203import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
204import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
205import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
206import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
207import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
208import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
209import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
210import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
211import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
213import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
214import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
215import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
218import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
219import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
220import org.apache.hadoop.hdfs.server.blockmanagement.OutOfV1GenerationStampsException;
221import org.apache.hadoop.hdfs.server.common.GenerationStamp;
222import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
223import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
224import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
225import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
226import org.apache.hadoop.hdfs.server.common.Storage;
227import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
228import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
229import org.apache.hadoop.hdfs.server.common.Util;
230import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
231import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
232import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
233import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
234import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
235import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
236import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
237import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
238import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
239import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
240import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
241import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
242import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
243import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
244import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
245import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
246import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
247import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
248import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
249import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
250import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
252import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
253import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
254import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
255import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
256import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
257import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
258import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
259import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
260import org.apache.hadoop.hdfs.server.protocol.StorageReport;
261import org.apache.hadoop.hdfs.util.ChunkedArrayList;
262import org.apache.hadoop.io.IOUtils;
263import org.apache.hadoop.io.Text;
264import org.apache.hadoop.ipc.RetriableException;
265import org.apache.hadoop.ipc.RetryCache;
266import org.apache.hadoop.ipc.RetryCache.CacheEntry;
267import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload;
268import org.apache.hadoop.ipc.Server;
269import org.apache.hadoop.ipc.StandbyException;
270import org.apache.hadoop.metrics2.annotation.Metric;
271import org.apache.hadoop.metrics2.annotation.Metrics;
272import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
273import org.apache.hadoop.metrics2.util.MBeans;
274import org.apache.hadoop.net.NetworkTopology;
275import org.apache.hadoop.net.Node;
276import org.apache.hadoop.net.NodeBase;
277import org.apache.hadoop.security.AccessControlException;
278import org.apache.hadoop.security.UserGroupInformation;
279import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
280import org.apache.hadoop.security.token.SecretManager.InvalidToken;
281import org.apache.hadoop.security.token.Token;
282import org.apache.hadoop.security.token.TokenIdentifier;
283import org.apache.hadoop.security.token.delegation.DelegationKey;
284import org.apache.hadoop.util.Daemon;
285import org.apache.hadoop.util.DataChecksum;
286import org.apache.hadoop.util.StringUtils;
287import org.apache.hadoop.util.Time;
288import org.apache.hadoop.util.VersionInfo;
289import org.apache.log4j.Appender;
290import org.apache.log4j.AsyncAppender;
291import org.apache.log4j.Logger;
292import org.mortbay.util.ajax.JSON;
293
294import com.google.common.annotations.VisibleForTesting;
295import com.google.common.base.Charsets;
296import com.google.common.base.Preconditions;
297import com.google.common.collect.ImmutableMap;
298import com.google.common.collect.Lists;
299
300/***************************************************
301 * FSNamesystem does the actual bookkeeping work for the
302 * DataNode.
303 *
304 * It tracks several important tables.
305 *
306 * 1)  valid fsname --> blocklist  (kept on disk, logged)
307 * 2)  Set of all valid blocks (inverted #1)
308 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
309 * 4)  machine --> blocklist (inverted #2)
310 * 5)  LRU cache of updated-heartbeat machines
311 ***************************************************/
312@InterfaceAudience.Private
313@Metrics(context="dfs")
314public class FSNamesystem implements Namesystem, FSClusterStats,
315    FSNamesystemMBean, NameNodeMXBean {
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);

  // Per-thread reusable StringBuilder, presumably used to assemble audit
  // log lines without allocating a new builder per audited operation --
  // its consumers are not visible in this part of the file.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };
325
326  @VisibleForTesting
327  public boolean isAuditEnabled() {
328    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
329  }
330
331  private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
332      throws IOException {
333    return (isAuditEnabled() && isExternalInvocation())
334        ? dir.getFileInfo(path, resolveSymlink, false, false) : null;
335  }
336  
  /**
   * Log an audit event that has no destination path and no file status;
   * delegates to
   * {@link #logAuditEvent(boolean, String, String, String, HdfsFileStatus)}.
   *
   * @param succeeded whether the audited command succeeded
   * @param cmd the name of the command being audited
   * @param src the source path the command operated on
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
341  
342  private void logAuditEvent(boolean succeeded, String cmd, String src,
343      String dst, HdfsFileStatus stat) throws IOException {
344    if (isAuditEnabled() && isExternalInvocation()) {
345      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
346                    cmd, src, dst, stat);
347    }
348  }
349
350  private void logAuditEvent(boolean succeeded,
351      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
352      String dst, HdfsFileStatus stat) {
353    FileStatus status = null;
354    if (stat != null) {
355      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
356      Path path = dst != null ? new Path(dst) : new Path(src);
357      status = new FileStatus(stat.getLen(), stat.isDir(),
358          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
359          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
360          stat.getGroup(), symlink, path);
361    }
362    for (AuditLogger logger : auditLoggers) {
363      if (logger instanceof HdfsAuditLogger) {
364        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
365        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
366            status, ugi, dtSecretManager);
367      } else {
368        logger.logAuditEvent(succeeded, ugi.toString(), addr,
369            cmd, src, dst, status);
370      }
371    }
372  }
373
374  /**
375   * Logger for audit events, noting successful FSNamesystem operations. Emits
376   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
377   * <code>key=value</code> pairs to be written for the following properties:
378   * <code>
379   * ugi=&lt;ugi in RPC&gt;
380   * ip=&lt;remote IP&gt;
381   * cmd=&lt;command&gt;
382   * src=&lt;src path&gt;
383   * dst=&lt;dst path (optional)&gt;
384   * perm=&lt;permissions (optional)&gt;
385   * </code>
386   */
387  public static final Log auditLog = LogFactory.getLog(
388      FSNamesystem.class.getName() + ".audit");
389
390  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
391  static int BLOCK_DELETION_INCREMENT = 1000;
392  private final boolean isPermissionEnabled;
393  private final UserGroupInformation fsOwner;
394  private final String fsOwnerShortUserName;
395  private final String supergroup;
396  private final boolean standbyShouldCheckpoint;
397  
398  // Scan interval is not configurable.
399  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
400    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
401  final DelegationTokenSecretManager dtSecretManager;
402  private final boolean alwaysUseDelegationTokensForTests;
403
404  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
405    new Step(StepType.AWAITING_REPORTED_BLOCKS);
406
407  // Tracks whether the default audit logger is the only configured audit
408  // logger; this allows isAuditEnabled() to return false in case the
409  // underlying logger is disabled, and avoid some unnecessary work.
410  private final boolean isDefaultAuditLogger;
411  private final List<AuditLogger> auditLoggers;
412
413  /** The namespace tree. */
414  FSDirectory dir;
415  private final BlockManager blockManager;
416  private final SnapshotManager snapshotManager;
417  private final CacheManager cacheManager;
418  private final DatanodeStatistics datanodeStatistics;
419
420  // whether setStoragePolicy is allowed.
421  private final boolean isStoragePolicyEnabled;
422
423  private String nameserviceId;
424
425  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
426  /**
427   * A flag that indicates whether the checkpointer should checkpoint a rollback
428   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
429   * rollback fsimage if the flag is true, and then change the flag to false.
430   */
431  private volatile boolean needRollbackFsImage;
432
433  // Block pool ID used by this namenode
434  private String blockPoolId;
435
436  final LeaseManager leaseManager = new LeaseManager(this); 
437
438  volatile Daemon smmthread = null;  // SafeModeMonitor thread
439  
440  Daemon nnrmthread = null; // NamenodeResourceMonitor thread
441
442  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
443
444  // A daemon to periodically clean up corrupt lazyPersist files
445  // from the name space.
446  Daemon lazyPersistFileScrubber = null;
447  /**
448   * When an active namenode will roll its own edit log, in # edits
449   */
450  private final long editLogRollerThreshold;
451  /**
452   * Check interval of an active namenode's edit log roller thread 
453   */
454  private final int editLogRollerInterval;
455
456  /**
457   * How frequently we scan and unlink corrupt lazyPersist files.
458   * (In seconds)
459   */
460  private final int lazyPersistFileScrubIntervalSec;
461
462  private volatile boolean hasResourcesAvailable = false;
463  private volatile boolean fsRunning = true;
464  
465  /** The start time of the namesystem. */
466  private final long startTime = now();
467
468  /** The interval of namenode checking for the disk space availability */
469  private final long resourceRecheckInterval;
470
471  // The actual resource checker instance.
472  NameNodeResourceChecker nnResourceChecker;
473
474  private final FsServerDefaults serverDefaults;
475  private final boolean supportAppends;
476  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
477
478  private volatile SafeModeInfo safeMode;  // safe mode information
479
480  private final long maxFsObjects;          // maximum number of fs objects
481
482  private final long minBlockSize;         // minimum block size
483  private final long maxBlocksPerFile;     // maximum # of blocks per file
484
485  /**
486   * The global generation stamp for legacy blocks with randomly
487   * generated block IDs.
488   */
489  private final GenerationStamp generationStampV1 = new GenerationStamp();
490
491  /**
492   * The global generation stamp for this file system.
493   */
494  private final GenerationStamp generationStampV2 = new GenerationStamp();
495
496  /**
497   * The value of the generation stamp when the first switch to sequential
498   * block IDs was made. Blocks with generation stamps below this value
499   * have randomly allocated block IDs. Blocks with generation stamps above
500   * this value had sequentially allocated block IDs. Read from the fsImage
501   * (or initialized as an offset from the V1 (legacy) generation stamp on
502   * upgrade).
503   */
504  private long generationStampV1Limit =
505      GenerationStamp.GRANDFATHER_GENERATION_STAMP;
506
507  /**
508   * The global block ID space for this file system.
509   */
510  @VisibleForTesting
511  private final SequentialBlockIdGenerator blockIdGenerator;
512
513  // precision of access times.
514  private final long accessTimePrecision;
515
516  /** Lock to protect FSNamesystem. */
517  private final FSNamesystemLock fsLock;
518
519  /**
520   * Used when this NN is in standby state to read from the shared edit log.
521   */
522  private EditLogTailer editLogTailer = null;
523
524  /**
525   * Used when this NN is in standby state to perform checkpoints.
526   */
527  private StandbyCheckpointer standbyCheckpointer;
528
529  /**
530   * Reference to the NN's HAContext object. This is only set once
531   * {@link #startCommonServices(Configuration, HAContext)} is called. 
532   */
533  private HAContext haContext;
534
535  private final boolean haEnabled;
536
537  /** flag indicating whether replication queues have been initialized */
538  boolean initializedReplQueues = false;
539
540  /**
541   * Whether the namenode is in the middle of starting the active service
542   */
543  private volatile boolean startingActiveService = false;
544    
545  private INodeId inodeId;
546  
547  private final RetryCache retryCache;
548
549  private final NNConf nnConf;
550
551  private KeyProviderCryptoExtension provider = null;
552  private KeyProvider.Options providerOptions = null;
553
554  private final CryptoCodec codec;
555
556  private volatile boolean imageLoaded = false;
557  private final Condition cond;
558
559  private final FSImage fsImage;
560
561  /**
562   * Notify that loading of this FSDirectory is complete, and
563   * it is imageLoaded for use
564   */
565  void imageLoadComplete() {
566    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
567    setImageLoaded();
568  }
569
570  void setImageLoaded() {
571    if(imageLoaded) return;
572    writeLock();
573    try {
574      setImageLoaded(true);
575      dir.markNameCacheInitialized();
576      cond.signalAll();
577    } finally {
578      writeUnlock();
579    }
580  }
581
  //This is for testing purposes only
  /** @return true once the FS image has finished loading. */
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }
587
  // exposed for unit tests
  /** Directly set the image-loaded flag, without locking or signalling. */
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
592
593  /**
594   * Block until the object is imageLoaded to be used.
595   */
596  void waitForLoadingFSImage() {
597    if (!imageLoaded) {
598      writeLock();
599      try {
600        while (!imageLoaded) {
601          try {
602            cond.await(5000, TimeUnit.MILLISECONDS);
603          } catch (InterruptedException ignored) {
604          }
605        }
606      } finally {
607        writeUnlock();
608      }
609    }
610  }
611
612  /**
613   * Set the last allocated inode id when fsimage or editlog is loaded. 
614   */
615  public void resetLastInodeId(long newValue) throws IOException {
616    try {
617      inodeId.skipTo(newValue);
618    } catch(IllegalStateException ise) {
619      throw new IOException(ise);
620    }
621  }
622
  /**
   * Should only be used for tests to reset to any value
   * @param newValue the inode ID value to force, with no validation
   */
  void resetLastInodeIdWithoutChecking(long newValue) {
    inodeId.setCurrentValue(newValue);
  }
627  
  /** @return the last inode ID handed out by this namesystem. */
  public long getLastInodeId() {
    return inodeId.getCurrentValue();
  }
632
  /** Allocate a new inode ID.
   * @return the next value from the inode ID generator
   */
  public long allocateNewInodeId() {
    return inodeId.nextValue();
  }
637  
638  /**
639   * Clear all loaded data
640   */
641  void clear() {
642    dir.reset();
643    dtSecretManager.reset();
644    generationStampV1.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
645    generationStampV2.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
646    blockIdGenerator.setCurrentValue(
647        SequentialBlockIdGenerator.LAST_RESERVED_BLOCK_ID);
648    generationStampV1Limit = GenerationStamp.GRANDFATHER_GENERATION_STAMP;
649    leaseManager.removeAllLeases();
650    inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
651    snapshotManager.clearSnapshottableDirs();
652    cacheManager.clear();
653    setImageLoaded(false);
654    blockManager.clear();
655  }
656
  /** @return the lease manager; exposed for tests. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
661  
  /** @return true if high availability (HA) is enabled for this NN. */
  boolean isHaEnabled() {
    return haEnabled;
  }
665  
666  /**
667   * Check the supplied configuration for correctness.
668   * @param conf Supplies the configuration to validate.
669   * @throws IOException if the configuration could not be queried.
670   * @throws IllegalArgumentException if the configuration is invalid.
671   */
672  private static void checkConfiguration(Configuration conf)
673      throws IOException {
674
675    final Collection<URI> namespaceDirs =
676        FSNamesystem.getNamespaceDirs(conf);
677    final Collection<URI> editsDirs =
678        FSNamesystem.getNamespaceEditsDirs(conf);
679    final Collection<URI> requiredEditsDirs =
680        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
681    final Collection<URI> sharedEditsDirs =
682        FSNamesystem.getSharedEditsDirs(conf);
683
684    for (URI u : requiredEditsDirs) {
685      if (u.toString().compareTo(
686              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
687        continue;
688      }
689
690      // Each required directory must also be in editsDirs or in
691      // sharedEditsDirs.
692      if (!editsDirs.contains(u) &&
693          !sharedEditsDirs.contains(u)) {
694        throw new IllegalArgumentException(
695            "Required edits directory " + u.toString() + " not present in " +
696            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
697            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
698            editsDirs.toString() + "; " +
699            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
700            requiredEditsDirs.toString() + ". " +
701            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
702            sharedEditsDirs.toString() + ".");
703      }
704    }
705
706    if (namespaceDirs.size() == 1) {
707      LOG.warn("Only one image storage directory ("
708          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
709          + " due to lack of redundant storage directories!");
710    }
711    if (editsDirs.size() == 1) {
712      LOG.warn("Only one namespace edits storage directory ("
713          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
714          + " due to lack of redundant storage directories!");
715    }
716  }
717
718  /**
719   * Instantiates an FSNamesystem loaded from the image and edits
720   * directories specified in the passed Configuration.
721   *
722   * @param conf the Configuration which specifies the storage directories
723   *             from which to load
724   * @return an FSNamesystem which contains the loaded namespace
725   * @throws IOException if loading fails
726   */
727  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
728
729    checkConfiguration(conf);
730    FSImage fsImage = new FSImage(conf,
731        FSNamesystem.getNamespaceDirs(conf),
732        FSNamesystem.getNamespaceEditsDirs(conf));
733    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
734    StartupOption startOpt = NameNode.getStartupOption(conf);
735    if (startOpt == StartupOption.RECOVER) {
736      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
737    }
738
739    long loadStart = now();
740    try {
741      namesystem.loadFSImage(startOpt);
742    } catch (IOException ioe) {
743      LOG.warn("Encountered exception loading fsimage", ioe);
744      fsImage.close();
745      throw ioe;
746    }
747    long timeTakenToLoadFSImage = now() - loadStart;
748    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
749    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
750    if (nnMetrics != null) {
751      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
752    }
753    return namesystem;
754  }
755  
  /**
   * Convenience constructor; delegates with {@code ignoreRetryCache=false}
   * so the retry cache is set up normally.
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
759  
760  /**
761   * Create an FSNamesystem associated with the specified image.
762   * 
763   * Note that this does not load any data off of disk -- if you would
764   * like that behavior, use {@link #loadFromDisk(Configuration)}
765   *
766   * @param conf configuration
767   * @param fsImage The FSImage to associate with
768   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
769   *                         step. For Secondary NN this should be set to true.
770   * @throws IOException on bad configuration
771   */
772  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
773      throws IOException {
774    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
775    if (provider == null) {
776      LOG.info("No KeyProvider found.");
777    } else {
778      LOG.info("Found KeyProvider: " + provider.toString());
779    }
780    providerOptions = KeyProvider.options(conf);
781    this.codec = CryptoCodec.getInstance(conf);
782    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
783                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
784      LOG.info("Enabling async auditlog");
785      enableAsyncAuditLog();
786    }
787    boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
788    LOG.info("fsLock is fair:" + fair);
789    fsLock = new FSNamesystemLock(fair);
790    cond = fsLock.writeLock().newCondition();
791    this.fsImage = fsImage;
792    try {
793      resourceRecheckInterval = conf.getLong(
794          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
795          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
796
797      this.blockManager = new BlockManager(this, this, conf);
798      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
799      this.blockIdGenerator = new SequentialBlockIdGenerator(this.blockManager);
800
801      this.isStoragePolicyEnabled =
802          conf.getBoolean(DFS_STORAGE_POLICY_ENABLED_KEY,
803                          DFS_STORAGE_POLICY_ENABLED_DEFAULT);
804
805      this.fsOwner = UserGroupInformation.getCurrentUser();
806      this.fsOwnerShortUserName = fsOwner.getShortUserName();
807      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
808                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
809      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
810                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
811      LOG.info("fsOwner             = " + fsOwner);
812      LOG.info("supergroup          = " + supergroup);
813      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
814
815      // block allocation has to be persisted in HA using a shared edits directory
816      // so that the standby has up-to-date namespace information
817      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
818      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
819      
820      // Sanity check the HA-related config.
821      if (nameserviceId != null) {
822        LOG.info("Determined nameservice ID: " + nameserviceId);
823      }
824      LOG.info("HA Enabled: " + haEnabled);
825      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
826        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
827        throw new IOException("Invalid configuration: a shared edits dir " +
828            "must not be specified if HA is not enabled.");
829      }
830
831      // Get the checksum type from config
832      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
833      DataChecksum.Type checksumType;
834      try {
835         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
836      } catch (IllegalArgumentException iae) {
837         throw new IOException("Invalid checksum type in "
838            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
839      }
840
841      this.serverDefaults = new FsServerDefaults(
842          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
843          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
844          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
845          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
846          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
847          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
848          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
849          checksumType);
850      
851      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
852                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
853
854      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
855          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
856      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
857          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
858      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
859          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
860      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
861      LOG.info("Append Enabled: " + supportAppends);
862
863      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
864      
865      this.standbyShouldCheckpoint = conf.getBoolean(
866          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
867      // # edit autoroll threshold is a multiple of the checkpoint threshold 
868      this.editLogRollerThreshold = (long)
869          (conf.getFloat(
870              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
871              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
872          conf.getLong(
873              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
874              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
875      this.editLogRollerInterval = conf.getInt(
876          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
877          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
878      this.inodeId = new INodeId();
879      
880      this.lazyPersistFileScrubIntervalSec = conf.getInt(
881          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
882          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
883
884      if (this.lazyPersistFileScrubIntervalSec == 0) {
885        throw new IllegalArgumentException(
886            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
887      }
888
889      // For testing purposes, allow the DT secret manager to be started regardless
890      // of whether security is enabled.
891      alwaysUseDelegationTokensForTests = conf.getBoolean(
892          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
893          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
894      
895      this.dtSecretManager = createDelegationTokenSecretManager(conf);
896      this.dir = new FSDirectory(this, conf);
897      this.snapshotManager = new SnapshotManager(dir);
898      this.cacheManager = new CacheManager(this, conf, blockManager);
899      this.safeMode = new SafeModeInfo(conf);
900      this.auditLoggers = initAuditLoggers(conf);
901      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
902        auditLoggers.get(0) instanceof DefaultAuditLogger;
903      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
904      this.nnConf = new NNConf(conf);
905    } catch(IOException e) {
906      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
907      close();
908      throw e;
909    } catch (RuntimeException re) {
910      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
911      close();
912      throw re;
913    }
914  }
915  
  /** @return the retry cache, or null when disabled; exposed for tests. */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }
920
921  void lockRetryCache() {
922    if (retryCache != null) {
923      retryCache.lock();
924    }
925  }
926
927  void unlockRetryCache() {
928    if (retryCache != null) {
929      retryCache.unlock();
930    }
931  }
932
933  /** Whether or not retry cache is enabled */
934  boolean hasRetryCache() {
935    return retryCache != null;
936  }
937  
938  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
939    if (retryCache != null) {
940      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
941    }
942  }
943  
944  void addCacheEntry(byte[] clientId, int callId) {
945    if (retryCache != null) {
946      retryCache.addCacheEntry(clientId, callId);
947    }
948  }
949
  /** @return the key provider crypto extension, or null if none configured. */
  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }
954
955  @VisibleForTesting
956  static RetryCache initRetryCache(Configuration conf) {
957    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
958                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
959    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
960    if (enable) {
961      float heapPercent = conf.getFloat(
962          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
963          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
964      long entryExpiryMillis = conf.getLong(
965          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
966          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
967      LOG.info("Retry cache will use " + heapPercent
968          + " of total heap and retry cache entry expiry time is "
969          + entryExpiryMillis + " millis");
970      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
971      return new RetryCache("NameNodeRetryCache", heapPercent,
972          entryExpiryNanos);
973    }
974    return null;
975  }
976
977  private List<AuditLogger> initAuditLoggers(Configuration conf) {
978    // Initialize the custom access loggers if configured.
979    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
980    List<AuditLogger> auditLoggers = Lists.newArrayList();
981    if (alClasses != null && !alClasses.isEmpty()) {
982      for (String className : alClasses) {
983        try {
984          AuditLogger logger;
985          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
986            logger = new DefaultAuditLogger();
987          } else {
988            logger = (AuditLogger) Class.forName(className).newInstance();
989          }
990          logger.initialize(conf);
991          auditLoggers.add(logger);
992        } catch (RuntimeException re) {
993          throw re;
994        } catch (Exception e) {
995          throw new RuntimeException(e);
996        }
997      }
998    }
999
1000    // Make sure there is at least one logger installed.
1001    if (auditLoggers.isEmpty()) {
1002      auditLoggers.add(new DefaultAuditLogger());
1003    }
1004    return Collections.unmodifiableList(auditLoggers);
1005  }
1006
  /**
   * Load the namespace from the FSImage according to the startup option,
   * saving a new image and/or opening the edit log for write as required.
   * Holds the namesystem write lock for the duration of the load.
   *
   * @param startOpt how the NameNode was started (FORMAT is handled here
   *                 and then treated as REGULAR)
   * @throws IOException if the image cannot be loaded or saved
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      // Rolling-upgrade rollback/downgrade discards any in-progress
      // rolling upgrade info.
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      // On failure, close the image to release storage directories/locks.
      if (!success) {
        fsImage.close();
      }
      writeUnlock();
    }
    imageLoadComplete();
  }
1057
1058  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1059      StartupOption startOpt) throws IOException {
1060    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1061        .matches(startOpt) && layoutVersion > HdfsConstants
1062        .NAMENODE_LAYOUT_VERSION;
1063    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1064        .matches(startOpt);
1065    if (rollingRollback || rollingStarted) {
1066      fsImage.updateStorageVersion();
1067    }
1068  }
1069
1070  private void startSecretManager() {
1071    if (dtSecretManager != null) {
1072      try {
1073        dtSecretManager.startThreads();
1074      } catch (IOException e) {
1075        // Inability to start secret manager
1076        // can't be recovered from.
1077        throw new RuntimeException(e);
1078      }
1079    }
1080  }
1081  
1082  private void startSecretManagerIfNecessary() {
1083    boolean shouldRun = shouldUseDelegationTokens() &&
1084      !isInSafeMode() && getEditLog().isOpenForWrite();
1085    boolean running = dtSecretManager.isRunning();
1086    if (shouldRun && !running) {
1087      startSecretManager();
1088    }
1089  }
1090
1091  private void stopSecretManager() {
1092    if (dtSecretManager != null) {
1093      dtSecretManager.stopThreads();
1094    }
1095  }
1096  
1097  /** 
1098   * Start services common to both active and standby states
1099   */
1100  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1101    this.registerMBean(); // register the MBean for the FSNamesystemState
1102    writeLock();
1103    this.haContext = haContext;
1104    try {
1105      nnResourceChecker = new NameNodeResourceChecker(conf);
1106      checkAvailableResources();
1107      assert safeMode != null && !isPopulatingReplQueues();
1108      StartupProgress prog = NameNode.getStartupProgress();
1109      prog.beginPhase(Phase.SAFEMODE);
1110      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1111        getCompleteBlocksTotal());
1112      setBlockTotal();
1113      blockManager.activate(conf);
1114    } finally {
1115      writeUnlock();
1116    }
1117    
1118    registerMXBean();
1119    DefaultMetricsSystem.instance().register(this);
1120    snapshotManager.registerMXBean();
1121  }
1122  
1123  /** 
1124   * Stop services common to both active and standby states
1125   */
1126  void stopCommonServices() {
1127    writeLock();
1128    try {
1129      if (blockManager != null) blockManager.close();
1130    } finally {
1131      writeUnlock();
1132    }
1133    RetryCache.clear(retryCache);
1134  }
1135  
1136  /**
1137   * Start services required in active state
1138   * @throws IOException
1139   */
1140  void startActiveServices() throws IOException {
1141    startingActiveService = true;
1142    LOG.info("Starting services required for active state");
1143    writeLock();
1144    try {
1145      FSEditLog editLog = getFSImage().getEditLog();
1146      
1147      if (!editLog.isOpenForWrite()) {
1148        // During startup, we're already open for write during initialization.
1149        editLog.initJournalsForWrite();
1150        // May need to recover
1151        editLog.recoverUnclosedStreams();
1152        
1153        LOG.info("Catching up to latest edits from old active before " +
1154            "taking over writer role in edits logs");
1155        editLogTailer.catchupDuringFailover();
1156        
1157        blockManager.setPostponeBlocksFromFuture(false);
1158        blockManager.getDatanodeManager().markAllDatanodesStale();
1159        blockManager.clearQueues();
1160        blockManager.processAllPendingDNMessages();
1161
1162        // Only need to re-process the queue, If not in SafeMode.
1163        if (!isInSafeMode()) {
1164          LOG.info("Reprocessing replication and invalidation queues");
1165          initializeReplQueues();
1166        }
1167
1168        if (LOG.isDebugEnabled()) {
1169          LOG.debug("NameNode metadata after re-processing " +
1170              "replication and invalidation queues during failover:\n" +
1171              metaSaveAsString());
1172        }
1173        
1174        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1175        LOG.info("Will take over writing edit logs at txnid " + 
1176            nextTxId);
1177        editLog.setNextTxId(nextTxId);
1178
1179        getFSImage().editLog.openForWrite();
1180      }
1181
1182      // Enable quota checks.
1183      dir.enableQuotaChecks();
1184      if (haEnabled) {
1185        // Renew all of the leases before becoming active.
1186        // This is because, while we were in standby mode,
1187        // the leases weren't getting renewed on this NN.
1188        // Give them all a fresh start here.
1189        leaseManager.renewAllLeases();
1190      }
1191      leaseManager.startMonitor();
1192      startSecretManagerIfNecessary();
1193
1194      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1195      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1196      nnrmthread.start();
1197
1198      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1199          editLogRollerThreshold, editLogRollerInterval));
1200      nnEditLogRoller.start();
1201
1202      if (lazyPersistFileScrubIntervalSec > 0) {
1203        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1204            lazyPersistFileScrubIntervalSec));
1205        lazyPersistFileScrubber.start();
1206      }
1207
1208      cacheManager.startMonitorThread();
1209      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1210    } finally {
1211      startingActiveService = false;
1212      checkSafeMode();
1213      writeUnlock();
1214    }
1215  }
1216
1217  /**
1218   * Initialize replication queues.
1219   */
1220  private void initializeReplQueues() {
1221    LOG.info("initializing replication queues");
1222    blockManager.processMisReplicatedBlocks();
1223    initializedReplQueues = true;
1224  }
1225
1226  private boolean inActiveState() {
1227    return haContext != null &&
1228        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
1229  }
1230
1231  /**
1232   * @return Whether the namenode is transitioning to active state and is in the
1233   *         middle of the {@link #startActiveServices()}
1234   */
1235  public boolean inTransitionToActive() {
1236    return haEnabled && inActiveState() && startingActiveService;
1237  }
1238
1239  private boolean shouldUseDelegationTokens() {
1240    return UserGroupInformation.isSecurityEnabled() ||
1241      alwaysUseDelegationTokensForTests;
1242  }
1243
1244  /** 
1245   * Stop services required in active state
1246   */
1247  void stopActiveServices() {
1248    LOG.info("Stopping services started for active state");
1249    writeLock();
1250    try {
1251      stopSecretManager();
1252      leaseManager.stopMonitor();
1253      if (nnrmthread != null) {
1254        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1255        nnrmthread.interrupt();
1256      }
1257      if (nnEditLogRoller != null) {
1258        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1259        nnEditLogRoller.interrupt();
1260      }
1261      if (lazyPersistFileScrubber != null) {
1262        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1263        lazyPersistFileScrubber.interrupt();
1264      }
1265      if (dir != null && getFSImage() != null) {
1266        if (getFSImage().editLog != null) {
1267          getFSImage().editLog.close();
1268        }
1269        // Update the fsimage with the last txid that we wrote
1270        // so that the tailer starts from the right spot.
1271        getFSImage().updateLastAppliedTxIdFromWritten();
1272      }
1273      if (cacheManager != null) {
1274        cacheManager.stopMonitorThread();
1275        cacheManager.clearDirectiveStats();
1276      }
1277      blockManager.getDatanodeManager().clearPendingCachingCommands();
1278      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1279      // Don't want to keep replication queues when not in Active.
1280      blockManager.clearQueues();
1281      initializedReplQueues = false;
1282    } finally {
1283      writeUnlock();
1284    }
1285  }
1286  
1287  /**
1288   * Start services required in standby state 
1289   * 
1290   * @throws IOException
1291   */
1292  void startStandbyServices(final Configuration conf) throws IOException {
1293    LOG.info("Starting services required for standby state");
1294    if (!getFSImage().editLog.isOpenForRead()) {
1295      // During startup, we're already open for read.
1296      getFSImage().editLog.initSharedJournalsForRead();
1297    }
1298    
1299    blockManager.setPostponeBlocksFromFuture(true);
1300
1301    // Disable quota checks while in standby.
1302    dir.disableQuotaChecks();
1303    editLogTailer = new EditLogTailer(this, conf);
1304    editLogTailer.start();
1305    if (standbyShouldCheckpoint) {
1306      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1307      standbyCheckpointer.start();
1308    }
1309  }
1310
1311  /**
1312   * Called when the NN is in Standby state and the editlog tailer tails the
1313   * OP_ROLLING_UPGRADE_START.
1314   */
1315  void triggerRollbackCheckpoint() {
1316    setNeedRollbackFsImage(true);
1317    if (standbyCheckpointer != null) {
1318      standbyCheckpointer.triggerRollbackCheckpoint();
1319    }
1320  }
1321
1322  /**
1323   * Called while the NN is in Standby state, but just about to be
1324   * asked to enter Active state. This cancels any checkpoints
1325   * currently being taken.
1326   */
1327  void prepareToStopStandbyServices() throws ServiceFailedException {
1328    if (standbyCheckpointer != null) {
1329      standbyCheckpointer.cancelAndPreventCheckpoints(
1330          "About to leave standby state");
1331    }
1332  }
1333
1334  /** Stop services required in standby state */
1335  void stopStandbyServices() throws IOException {
1336    LOG.info("Stopping services started for standby state");
1337    if (standbyCheckpointer != null) {
1338      standbyCheckpointer.stop();
1339    }
1340    if (editLogTailer != null) {
1341      editLogTailer.stop();
1342    }
1343    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
1344      getFSImage().editLog.close();
1345    }
1346  }
1347  
1348  @Override
1349  public void checkOperation(OperationCategory op) throws StandbyException {
1350    if (haContext != null) {
1351      // null in some unit tests
1352      haContext.checkOperation(op);
1353    }
1354  }
1355  
1356  /**
1357   * @throws RetriableException
1358   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1359   *           NameNode is in active state
1360   * @throws SafeModeException
1361   *           Otherwise if NameNode is in SafeMode.
1362   */
1363  private void checkNameNodeSafeMode(String errorMsg)
1364      throws RetriableException, SafeModeException {
1365    if (isInSafeMode()) {
1366      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1367      if (haEnabled && haContext != null
1368          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1369          && shouldRetrySafeMode(this.safeMode)) {
1370        throw new RetriableException(se);
1371      } else {
1372        throw se;
1373      }
1374    }
1375  }
1376  
1377  /**
1378   * We already know that the safemode is on. We will throw a RetriableException
1379   * if the safemode is not manual or caused by low resource.
1380   */
1381  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1382    if (safeMode == null) {
1383      return false;
1384    } else {
1385      return !safeMode.isManual() && !safeMode.areResourcesLow();
1386    }
1387  }
1388  
  /** @return the storage directories configured for the namespace image. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1392
1393  /**
1394   * Get all edits dirs which are required. If any shared edits dirs are
1395   * configured, these are also included in the set of required dirs.
1396   * 
1397   * @param conf the HDFS configuration.
1398   * @return all required dirs.
1399   */
1400  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1401    Set<URI> ret = new HashSet<URI>();
1402    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1403    ret.addAll(getSharedEditsDirs(conf));
1404    return ret;
1405  }
1406
1407  private static Collection<URI> getStorageDirs(Configuration conf,
1408                                                String propertyName) {
1409    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1410    StartupOption startOpt = NameNode.getStartupOption(conf);
1411    if(startOpt == StartupOption.IMPORT) {
1412      // In case of IMPORT this will get rid of default directories 
1413      // but will retain directories specified in hdfs-site.xml
1414      // When importing image from a checkpoint, the name-node can
1415      // start with empty set of storage directories.
1416      Configuration cE = new HdfsConfiguration(false);
1417      cE.addResource("core-default.xml");
1418      cE.addResource("core-site.xml");
1419      cE.addResource("hdfs-default.xml");
1420      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1421      dirNames.removeAll(dirNames2);
1422      if(dirNames.isEmpty())
1423        LOG.warn("!!! WARNING !!!" +
1424          "\n\tThe NameNode currently runs without persistent storage." +
1425          "\n\tAny changes to the file system meta-data may be lost." +
1426          "\n\tRecommended actions:" +
1427          "\n\t\t- shutdown and restart NameNode with configured \"" 
1428          + propertyName + "\" in hdfs-site.xml;" +
1429          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1430          "of the file system meta-data.");
1431    } else if (dirNames.isEmpty()) {
1432      dirNames = Collections.singletonList(
1433          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1434    }
1435    return Util.stringCollectionAsURIs(dirNames);
1436  }
1437
1438  /**
1439   * Return an ordered list of edits directories to write to.
1440   * The list is ordered such that all shared edits directories
1441   * are ordered before non-shared directories, and any duplicates
1442   * are removed. The order they are specified in the configuration
1443   * is retained.
1444   * @return Collection of shared edits directories.
1445   * @throws IOException if multiple shared edits directories are configured
1446   */
1447  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1448      throws IOException {
1449    return getNamespaceEditsDirs(conf, true);
1450  }
1451  
1452  public static List<URI> getNamespaceEditsDirs(Configuration conf,
1453      boolean includeShared)
1454      throws IOException {
1455    // Use a LinkedHashSet so that order is maintained while we de-dup
1456    // the entries.
1457    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
1458    
1459    if (includeShared) {
1460      List<URI> sharedDirs = getSharedEditsDirs(conf);
1461  
1462      // Fail until multiple shared edits directories are supported (HDFS-2782)
1463      if (sharedDirs.size() > 1) {
1464        throw new IOException(
1465            "Multiple shared edits directories are not yet supported");
1466      }
1467  
1468      // First add the shared edits dirs. It's critical that the shared dirs
1469      // are added first, since JournalSet syncs them in the order they are listed,
1470      // and we need to make sure all edits are in place in the shared storage
1471      // before they are replicated locally. See HDFS-2874.
1472      for (URI dir : sharedDirs) {
1473        if (!editsDirs.add(dir)) {
1474          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1475              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
1476        }
1477      }
1478    }    
1479    // Now add the non-shared dirs.
1480    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
1481      if (!editsDirs.add(dir)) {
1482        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1483            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
1484            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
1485      }
1486    }
1487
1488    if (editsDirs.isEmpty()) {
1489      // If this is the case, no edit dirs have been explicitly configured.
1490      // Image dirs are to be used for edits too.
1491      return Lists.newArrayList(getNamespaceDirs(conf));
1492    } else {
1493      return Lists.newArrayList(editsDirs);
1494    }
1495  }
1496  
1497  /**
1498   * Returns edit directories that are shared between primary and secondary.
1499   * @param conf configuration
1500   * @return collection of edit directories from {@code conf}
1501   */
  public static List<URI> getSharedEditsDirs(Configuration conf) {
    // don't use getStorageDirs here, because we want an empty default
    // rather than the dir in /tmp; an unset key therefore yields an
    // empty list, meaning no shared storage is configured.
    Collection<String> dirNames = conf.getTrimmedStringCollection(
        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
    return Util.stringCollectionAsURIs(dirNames);
  }
1509
  @Override
  public void readLock() {
    // Acquire the FSNamesystem read lock; pairs with readUnlock().
    this.fsLock.readLock().lock();
  }
  @Override
  public void longReadLockInterruptibly() throws InterruptedException {
    // Acquire the long-read lock first, then the regular read lock;
    // longReadUnlock() releases them in the reverse order.
    this.fsLock.longReadLock().lockInterruptibly();
    try {
      this.fsLock.readLock().lockInterruptibly();
    } catch (InterruptedException ie) {
      // In the event we're interrupted while getting the normal FSNS read lock,
      // release the long read lock.
      this.fsLock.longReadLock().unlock();
      throw ie;
    }
  }
  @Override
  public void longReadUnlock() {
    // Release in reverse acquisition order of longReadLockInterruptibly().
    this.fsLock.readLock().unlock();
    this.fsLock.longReadLock().unlock();
  }
  @Override
  public void readUnlock() {
    // Release the read lock acquired by readLock().
    this.fsLock.readLock().unlock();
  }
  @Override
  public void writeLock() {
    // Take the long-read lock before the write lock so writers also
    // exclude long-read holders; writeUnlock() releases in reverse order.
    this.fsLock.longReadLock().lock();
    this.fsLock.writeLock().lock();
  }
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    // Same acquisition order as writeLock(), but interruptible.
    this.fsLock.longReadLock().lockInterruptibly();
    try {
      this.fsLock.writeLock().lockInterruptibly();
    } catch (InterruptedException ie) {
      // In the event we're interrupted while getting the normal FSNS write
      // lock, release the long read lock.
      this.fsLock.longReadLock().unlock();
      throw ie;
    }
  }
  @Override
  public void writeUnlock() {
    // Release in reverse acquisition order of writeLock().
    this.fsLock.writeLock().unlock();
    this.fsLock.longReadLock().unlock();
  }
  @Override
  public boolean hasWriteLock() {
    // True only when the calling thread itself holds the write lock.
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  @Override
  public boolean hasReadLock() {
    // Holding the write lock also counts as holding read access.
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1565
  /** @return the calling thread's reentrant hold count on the read lock. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1569
  /** @return the calling thread's reentrant hold count on the write lock. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1573
  /** Build this namesystem's {@link NamespaceInfo} under the read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock();
    }
  }
1582
1583  /**
   * Version of {@link #getNamespaceInfo()} that is not protected by a lock.
1585   */
  NamespaceInfo unprotectedGetNamespaceInfo() {
    // Snapshot of namespace id, cluster id, block pool id and creation
    // time taken from the current FSImage storage; caller handles locking.
    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
        getClusterId(), getBlockPoolId(),
        getFSImage().getStorage().getCTime());
  }
1591
1592  /**
1593   * Close down this file system manager.
1594   * Causes heartbeat and lease daemons to stop; waits briefly for
1595   * them to finish, but a short timeout returns control back to caller.
1596   */
  void close() {
    // Mark the namesystem as stopped before tearing services down so
    // isRunning() callers observe shutdown as early as possible.
    fsRunning = false;
    try {
      stopCommonServices();
      if (smmthread != null) smmthread.interrupt();
    } finally {
      // using finally to ensure we also wait for lease daemon
      try {
        stopActiveServices();
        stopStandbyServices();
      } catch (IOException ie) {
        // Best-effort shutdown: swallow the failure so the cleanup
        // below still releases the directory and image resources.
      } finally {
        IOUtils.cleanup(LOG, dir);
        IOUtils.cleanup(LOG, fsImage);
      }
    }
  }
1614
  @Override
  public boolean isRunning() {
    // Liveness flag; cleared by close().
    return fsRunning;
  }
1619  
1620  @Override
1621  public boolean isInStandbyState() {
1622    if (haContext == null || haContext.getState() == null) {
1623      // We're still starting up. In this case, if HA is
1624      // on for the cluster, we always start in standby. Otherwise
1625      // start in active.
1626      return haEnabled;
1627    }
1628
1629    return HAServiceState.STANDBY == haContext.getState().getServiceState();
1630  }
1631
1632  /**
1633   * Dump all metadata into specified file
1634   */
1635  void metaSave(String filename) throws IOException {
1636    checkSuperuserPrivilege();
1637    checkOperation(OperationCategory.UNCHECKED);
1638    writeLock();
1639    try {
1640      checkOperation(OperationCategory.UNCHECKED);
1641      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1642      PrintWriter out = new PrintWriter(new BufferedWriter(
1643          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1644      metaSave(out);
1645      out.flush();
1646      out.close();
1647    } finally {
1648      writeUnlock();
1649    }
1650  }
1651
  // Writes the namespace summary (inode/block counts) followed by the
  // block manager's detailed state to the given writer. Caller must hold
  // the write lock.
  private void metaSave(PrintWriter out) {
    assert hasWriteLock();
    long totalInodes = this.dir.totalInodes();
    long totalBlocks = this.getBlocksTotal();
    out.println(totalInodes + " files and directories, " + totalBlocks
        + " blocks = " + (totalInodes + totalBlocks) + " total");

    blockManager.metaSave(out);
  }
1661
1662  private String metaSaveAsString() {
1663    StringWriter sw = new StringWriter();
1664    PrintWriter pw = new PrintWriter(sw);
1665    metaSave(pw);
1666    pw.flush();
1667    return sw.toString();
1668  }
1669  
1670
  /** @return the server-default block size, in bytes. */
  long getDefaultBlockSize() {
    return serverDefaults.getBlockSize();
  }
1674
  /**
   * @return the server defaults advertised to clients.
   * @throws StandbyException if this namenode cannot serve READ operations
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1679
  /** @return the configured access-time precision, in milliseconds. */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1683
  // Access-time tracking is enabled iff the precision is positive.
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1687
1688  /////////////////////////////////////////////////////////
1689  //
1690  // These methods are called by HadoopFS clients
1691  //
1692  /////////////////////////////////////////////////////////
1693  /**
1694   * Set permissions for an existing file.
1695   * @throws IOException
1696   */
  void setPermission(String src, FsPermission permission)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    try {
      setPermissionInt(src, permission);
    } catch (AccessControlException e) {
      // Record the denied attempt in the audit log before rethrowing.
      logAuditEvent(false, "setPermission", src);
      throw e;
    }
  }
1707
  // Core implementation of setPermission: resolves the (possibly reserved)
  // path, verifies ownership, applies the change under the write lock and
  // logs it to the edit log; the edit-log sync and audit happen outside
  // the lock.
  private void setPermissionInt(final String srcArg, FsPermission permission)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock: state may have transitioned.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set permission for " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      dir.setPermission(src, permission);
      getEditLog().logSetPermissions(src, permission);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Flush the edit-log entry outside the lock.
    getEditLog().logSync();
    logAuditEvent(true, "setPermission", srcArg, null, resultingStat);
  }
1731
1732  /**
1733   * Set owner for an existing file.
1734   * @throws IOException
1735   */
  void setOwner(String src, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    try {
      setOwnerInt(src, username, group);
    } catch (AccessControlException e) {
      // Record the denied attempt in the audit log before rethrowing.
      logAuditEvent(false, "setOwner", src);
      throw e;
    } 
  }
1746
  // Core implementation of setOwner. A null username or group means
  // "leave unchanged". Non-superusers may only set the owner to
  // themselves and the group to one they belong to.
  private void setOwnerInt(final String srcArg, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock: state may have transitioned.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set owner for " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      if (!pc.isSuperUser()) {
        if (username != null && !pc.getUser().equals(username)) {
          throw new AccessControlException("Non-super user cannot change owner");
        }
        if (group != null && !pc.containsGroup(group)) {
          throw new AccessControlException("User does not belong to " + group);
        }
      }
      dir.setOwner(src, username, group);
      getEditLog().logSetOwner(src, username, group);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Flush the edit-log entry outside the lock.
    getEditLog().logSync();
    logAuditEvent(true, "setOwner", srcArg, null, resultingStat);
  }
1778
1779  /**
1780   * Get block locations within the specified range.
1781   * @see ClientProtocol#getBlockLocations(String, long, long)
1782   */
  LocatedBlocks getBlockLocations(String clientMachine, String src,
      long offset, long length) throws AccessControlException,
      FileNotFoundException, UnresolvedLinkException, IOException {
    LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true,
        true);
    if (blocks != null) {
      // Order replicas of each block relative to the requesting client.
      blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
          blocks.getLocatedBlocks());

      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
      if (lastBlock != null) {
        ArrayList<LocatedBlock> lastBlockList =
            Lists.newArrayListWithCapacity(1);
        lastBlockList.add(lastBlock);
        blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
            lastBlockList);
      }
    }
    return blocks;
  }
1804
1805  /**
1806   * Get block locations within the specified range.
1807   * @see ClientProtocol#getBlockLocations(String, long, long)
   * @throws FileNotFoundException if the path does not exist
   * @throws UnresolvedLinkException if a symlink in the path cannot be resolved
   * @throws IOException on other errors
1809   */
  LocatedBlocks getBlockLocations(String src, long offset, long length,
      boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
      throws FileNotFoundException, UnresolvedLinkException, IOException {
    try {
      return getBlockLocationsInt(src, offset, length, doAccessTime,
                                  needBlockToken, checkSafeMode);
    } catch (AccessControlException e) {
      // Audit the denied "open" before propagating the failure.
      logAuditEvent(false, "open", src);
      throw e;
    }
  }
1821
  // Validates the requested range, fetches the locations (updating the
  // access time if needed), and — in safe mode — rejects results that
  // contain blocks with no known locations yet.
  private LocatedBlocks getBlockLocationsInt(String src, long offset,
      long length, boolean doAccessTime, boolean needBlockToken,
      boolean checkSafeMode)
      throws FileNotFoundException, UnresolvedLinkException, IOException {
    if (offset < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative offset is not supported. File: " + src);
    }
    if (length < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative length is not supported. File: " + src);
    }
    final LocatedBlocks ret = getBlockLocationsUpdateTimes(src,
        offset, length, doAccessTime, needBlockToken);  
    logAuditEvent(true, "open", src);
    if (checkSafeMode && isInSafeMode()) {
      for (LocatedBlock b : ret.getLocatedBlocks()) {
        // if safemode & no block locations yet then throw safemodeException
        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
          SafeModeException se = new SafeModeException(
              "Zero blocklocations for " + src, safeMode);
          if (haEnabled && haContext != null && 
              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
            // On an active HA namenode the condition is transient, so let
            // the client retry instead of failing outright.
            throw new RetriableException(se);
          } else {
            throw se;
          }
        }
      }
    }
    return ret;
  }
1854
1855  /*
1856   * Get block locations within the specified range, updating the
1857   * access times if necessary. 
1858   */
  // Two-pass lookup: the first attempt runs under the read lock; if the
  // access time must be updated, the attempt is abandoned and the whole
  // operation restarts under the write lock (needed to set atime).
  private LocatedBlocks getBlockLocationsUpdateTimes(final String srcArg,
      long offset, long length, boolean doAccessTime, boolean needBlockToken)
      throws FileNotFoundException,
      UnresolvedLinkException, IOException {
    String src = srcArg;
    FSPermissionChecker pc = getPermissionChecker();
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    for (int attempt = 0; attempt < 2; attempt++) {
      boolean isReadOp = (attempt == 0);
      if (isReadOp) { // first attempt is with readlock
        checkOperation(OperationCategory.READ);
        readLock();
      }  else { // second attempt is with  write lock
        checkOperation(OperationCategory.WRITE);
        writeLock(); // writelock is needed to set accesstime
      }
      try {
        src = resolvePath(src, pathComponents);
        // Re-check the operation category now that the lock is held.
        if (isReadOp) {
          checkOperation(OperationCategory.READ);
        } else {
          checkOperation(OperationCategory.WRITE);
        }
        if (isPermissionEnabled) {
          checkPathAccess(pc, src, FsAction.READ);
        }

        // if the namenode is in safemode, then do not update access time
        if (isInSafeMode()) {
          doAccessTime = false;
        }

        final INodesInPath iip = dir.getINodesInPath(src, true);
        final INode[] inodes = iip.getINodes();
        final INodeFile inode = INodeFile.valueOf(
            inodes[inodes.length - 1], src);
        if (isPermissionEnabled) {
          checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
        }
        if (!iip.isSnapshot() //snapshots are readonly, so don't update atime.
            && doAccessTime && isAccessTimeSupported()) {
          final long now = now();
          // Only update atime when it is stale by more than the configured
          // precision, to bound edit-log traffic.
          if (now > inode.getAccessTime() + getAccessTimePrecision()) {
            // if we have to set access time but we only have the readlock, then
            // restart this entire operation with the writeLock.
            if (isReadOp) {
              continue;
            }
            boolean changed = dir.setTimes(inode, -1, now, false,
                    iip.getLatestSnapshotId());
            if (changed) {
              getEditLog().logTimes(src, -1, now);
            }
          }
        }
        final long fileSize = iip.isSnapshot() ?
            inode.computeFileSize(iip.getPathSnapshotId())
            : inode.computeFileSizeNotIncludingLastUcBlock();
        boolean isUc = inode.isUnderConstruction();
        if (iip.isSnapshot()) {
          // if src indicates a snapshot file, we need to make sure the returned
          // blocks do not exceed the size of the snapshot file.
          length = Math.min(length, fileSize - offset);
          isUc = false;
        }

        // Raw-namespace reads skip decryption info on purpose.
        final FileEncryptionInfo feInfo =
          FSDirectory.isReservedRawName(srcArg) ?
          null : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(),
              iip);

        final LocatedBlocks blocks =
          blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
            isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);
        // Set caching information for the located blocks.
        for (LocatedBlock lb: blocks.getLocatedBlocks()) {
          cacheManager.setCachedLocations(lb);
        }
        return blocks;
      } finally {
        if (isReadOp) {
          readUnlock();
        } else {
          writeUnlock();
        }
      }
    }
    return null; // can never reach here
  }
1948
1949  /**
1950   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1951   * To avoid rollbacks we will verify validity of ALL of the args
1952   * before we start actual move.
1953   * 
1954   * This does not support ".inodes" relative path
1955   * @param target target to concat into
1956   * @param srcs file that will be concatenated
1957   * @throws IOException on error
1958   */
  void concat(String target, String [] srcs) 
      throws IOException, UnresolvedLinkException {
    // Retry-cache check: a re-sent RPC whose earlier attempt succeeded
    // returns immediately without redoing the work.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    
    // Either there is no previous request in progress or it has failed
    if(FSNamesystem.LOG.isDebugEnabled()) {
      FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
          " to " + target);
    }
    
    boolean success = false;
    try {
      concatInt(target, srcs, cacheEntry != null);
      success = true;
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating.
      logAuditEvent(false, "concat", Arrays.toString(srcs), target, null);
      throw e;
    } finally {
      // Record the outcome so duplicate RPCs are answered consistently.
      RetryCache.setState(cacheEntry, success);
    }
  }
1983
  // Validates the concat arguments (non-empty target, at least one source,
  // all in the same parent directory) before performing the move under the
  // write lock.
  private void concatInt(String target, String [] srcs, 
      boolean logRetryCache) throws IOException, UnresolvedLinkException {
    // verify args
    if(target.isEmpty()) {
      throw new IllegalArgumentException("Target file name is empty");
    }
    if(srcs == null || srcs.length == 0) {
      throw new IllegalArgumentException("No sources given");
    }
    
    // We require all files be in the same directory
    String trgParent = 
      target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR));
    for (String s : srcs) {
      String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR));
      if (!srcParent.equals(trgParent)) {
        throw new IllegalArgumentException(
           "Sources and target are not in the same directory");
      }
    }

    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot concat " + target);
      concatInternal(pc, target, srcs, logRetryCache);
      resultingStat = getAuditFileInfo(target, false);
    } finally {
      writeUnlock();
    }
    // Flush the edit-log entry outside the lock.
    getEditLog().logSync();
    logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat);
  }
2020
2021  /** See {@link #concat(String, String[])} */
  // Verifies every precondition for concat (permissions, no encryption
  // zone, target complete/non-empty/not-snapshotted, last block full,
  // matching replication and block sizes, no duplicate files) before
  // applying the move and logging it. Caller holds the write lock.
  private void concatInternal(FSPermissionChecker pc, String target,
      String[] srcs, boolean logRetryCache) throws IOException,
      UnresolvedLinkException {
    assert hasWriteLock();

    // write permission for the target
    if (isPermissionEnabled) {
      checkPathAccess(pc, target, FsAction.WRITE);

      // and srcs
      for(String aSrc: srcs) {
        checkPathAccess(pc, aSrc, FsAction.READ); // read the file
        checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete 
      }
    }

    // to make sure no two files are the same
    Set<INode> si = new HashSet<INode>();

    // we put the following prerequisite for the operation
    // replication and blocks sizes should be the same for ALL the blocks

    // check the target
    final INodesInPath trgIip = dir.getINodesInPath4Write(target);
    if (dir.getEZForPath(trgIip) != null) {
      throw new HadoopIllegalArgumentException(
          "concat can not be called for files in an encryption zone.");
    }
    final INodeFile trgInode = INodeFile.valueOf(trgIip.getLastINode(),
        target);
    if(trgInode.isUnderConstruction()) {
      throw new HadoopIllegalArgumentException("concat: target file "
          + target + " is under construction");
    }
    // per design target shouldn't be empty and all the blocks same size
    if(trgInode.numBlocks() == 0) {
      throw new HadoopIllegalArgumentException("concat: target file "
          + target + " is empty");
    }
    if (trgInode.isWithSnapshot()) {
      throw new HadoopIllegalArgumentException("concat: target file "
          + target + " is in a snapshot");
    }

    long blockSize = trgInode.getPreferredBlockSize();

    // check the end block to be full
    final BlockInfo last = trgInode.getLastBlock();
    if(blockSize != last.getNumBytes()) {
      throw new HadoopIllegalArgumentException("The last block in " + target
          + " is not full; last block size = " + last.getNumBytes()
          + " but file block size = " + blockSize);
    }

    si.add(trgInode);
    final short repl = trgInode.getFileReplication();

    // now check the srcs
    boolean endSrc = false; // final src file doesn't have to have full end block
    for(int i=0; i<srcs.length; i++) {
      String src = srcs[i];
      if(i==srcs.length-1)
        endSrc=true;

      final INodeFile srcInode = INodeFile.valueOf(dir.getINode4Write(src), src);
      if(src.isEmpty() 
          || srcInode.isUnderConstruction()
          || srcInode.numBlocks() == 0) {
        throw new HadoopIllegalArgumentException("concat: source file " + src
            + " is invalid or empty or underConstruction");
      }

      // check replication and blocks size
      if(repl != srcInode.getBlockReplication()) {
        throw new HadoopIllegalArgumentException("concat: the source file "
            + src + " and the target file " + target
            + " should have the same replication: source replication is "
            + srcInode.getBlockReplication()
            + " but target replication is " + repl);
      }

      //boolean endBlock=false;
      // verify that all the blocks are of the same length as target
      // should be enough to check the end blocks
      final BlockInfo[] srcBlocks = srcInode.getBlocks();
      int idx = srcBlocks.length-1;
      if(endSrc)
        idx = srcBlocks.length-2; // end block of endSrc is OK not to be full
      if(idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) {
        throw new HadoopIllegalArgumentException("concat: the source file "
            + src + " and the target file " + target
            + " should have the same blocks sizes: target block size is "
            + blockSize + " but the size of source block " + idx + " is "
            + srcBlocks[idx].getNumBytes());
      }

      si.add(srcInode);
    }

    // make sure no two files are the same
    if(si.size() < srcs.length+1) { // trg + srcs
      // it means at least two files are the same
      throw new HadoopIllegalArgumentException(
          "concat: at least two of the source files are the same");
    }

    if(NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " + 
          Arrays.toString(srcs) + " to " + target);
    }

    long timestamp = now();
    dir.concat(target, srcs, timestamp);
    getEditLog().logConcat(target, srcs, timestamp, logRetryCache);
  }
2137  
2138  /**
2139   * stores the modification and access time for this inode. 
2140   * The access time is precise up to an hour. The transaction, if needed, is
2141   * written to the edits log but is not flushed.
2142   */
2143  void setTimes(String src, long mtime, long atime) 
2144      throws IOException, UnresolvedLinkException {
2145    if (!isAccessTimeSupported() && atime != -1) {
2146      throw new IOException("Access time for hdfs is not configured. " +
2147                            " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
2148    }
2149    try {
2150      setTimesInt(src, mtime, atime);
2151    } catch (AccessControlException e) {
2152      logAuditEvent(false, "setTimes", src);
2153      throw e;
2154    }
2155  }
2156
  // Core implementation of setTimes. The edit-log entry is written but
  // deliberately NOT synced here (see the javadoc on setTimes: the
  // transaction "is written to the edits log but is not flushed").
  private void setTimesInt(final String srcArg, long mtime, long atime)
    throws IOException, UnresolvedLinkException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock: state may have transitioned.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set times " + src);
      src = resolvePath(src, pathComponents);

      // Write access is required to set access and modification times
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }
      final INodesInPath iip = dir.getINodesInPath4Write(src);
      final INode inode = iip.getLastINode();
      if (inode != null) {
        boolean changed = dir.setTimes(inode, mtime, atime, true,
                iip.getLatestSnapshotId());
        if (changed) {
          getEditLog().logTimes(src, mtime, atime);
        }
        resultingStat = getAuditFileInfo(src, false);
      } else {
        throw new FileNotFoundException("File/Directory " + src + " does not exist.");
      }
    } finally {
      writeUnlock();
    }
    logAuditEvent(true, "setTimes", srcArg, null, resultingStat);
  }
2191
2192  /**
2193   * Create a symbolic link.
2194   */
  @SuppressWarnings("deprecation")
  void createSymlink(String target, String link,
      PermissionStatus dirPerms, boolean createParent) 
      throws IOException, UnresolvedLinkException {
    // Validate before touching the retry cache: symlinks must be enabled,
    // the link path must be valid, and the target must not be reserved.
    if (!FileSystem.areSymlinksEnabled()) {
      throw new UnsupportedOperationException("Symlinks not supported");
    }
    if (!DFSUtil.isValidName(link)) {
      throw new InvalidPathException("Invalid link name: " + link);
    }
    if (FSDirectory.isReservedName(target)) {
      throw new InvalidPathException("Invalid target name: " + target);
    }
    // Retry-cache check: a re-sent RPC whose earlier attempt succeeded
    // returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    try {
      createSymlinkInt(target, link, dirPerms, createParent, cacheEntry != null);
      success = true;
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating.
      logAuditEvent(false, "createSymlink", link, target, null);
      throw e;
    } finally {
      // Record the outcome so duplicate RPCs are answered consistently.
      RetryCache.setState(cacheEntry, success);
    }
  }
2223
  // Core implementation of createSymlink: resolves the link path, checks
  // parent existence (unless createParent), creatability, permissions and
  // the inode quota, then adds the symlink under the write lock.
  private void createSymlinkInt(String target, final String linkArg,
      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 
      throws IOException, UnresolvedLinkException {
    String link = linkArg;
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
          + target + " link=" + link);
    }
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(link);
    writeLock();
    try {
      // Re-check after acquiring the lock: state may have transitioned.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create symlink " + link);
      link = resolvePath(link, pathComponents);
      if (!createParent) {
        verifyParentDir(link);
      }
      if (!dir.isValidToCreate(link)) {
        throw new IOException("failed to create link " + link 
            +" either because the filename is invalid or the file exists");
      }
      if (isPermissionEnabled) {
        checkAncestorAccess(pc, link, FsAction.WRITE);
      }
      // validate that we have enough inodes.
      checkFsObjectLimit();

      // add symbolic link to namespace
      addSymlink(link, target, dirPerms, createParent, logRetryCache);
      resultingStat = getAuditFileInfo(link, false);
    } finally {
      writeUnlock();
    }
    // Flush the edit-log entry outside the lock.
    getEditLog().logSync();
    logAuditEvent(true, "createSymlink", linkArg, target, resultingStat);
  }
2263
2264  /**
2265   * Set replication for an existing file.
2266   * 
2267   * The NameNode sets new replication and schedules either replication of 
2268   * under-replicated data blocks or removal of the excessive block copies 
2269   * if the blocks are over-replicated.
2270   * 
2271   * @see ClientProtocol#setReplication(String, short)
2272   * @param src file name
2273   * @param replication new replication
2274   * @return true if successful; 
2275   *         false if file does not exist or is a directory
2276   */
2277  boolean setReplication(final String src, final short replication)
2278      throws IOException {
2279    try {
2280      return setReplicationInt(src, replication);
2281    } catch (AccessControlException e) {
2282      logAuditEvent(false, "setReplication", src);
2283      throw e;
2284    }
2285  }
2286
  /**
   * Worker for {@link #setReplication}: validates the requested factor,
   * applies it under the write lock, and hands old/new counts to the
   * BlockManager so it can schedule re-replication or excess-replica removal.
   *
   * @param srcArg file path (may use the /.reserved prefix)
   * @param replication new replication factor
   * @return true if srcArg names an existing file; false otherwise
   */
  private boolean setReplicationInt(final String srcArg,
      final short replication) throws IOException {
    String src = srcArg;
    // Reject out-of-range replication factors before taking any locks.
    blockManager.verifyReplication(src, replication, null);
    final boolean isFile;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    waitForLoadingFSImage();
    writeLock();
    try {
      // Re-check after acquiring the lock: the HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set replication for " + src);
      src = resolvePath(src, pathComponents);
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }

      final short[] blockRepls = new short[2]; // 0: old, 1: new
      // blocks is null iff src does not resolve to a file.
      final Block[] blocks = dir.setReplication(src, replication, blockRepls);
      isFile = blocks != null;
      if (isFile) {
        getEditLog().logSetReplication(src, replication);
        // Queue replication/removal work based on the old vs. new counts.
        blockManager.setReplication(blockRepls[0], blockRepls[1], src, blocks);
      }
    } finally {
      writeUnlock();
    }

    getEditLog().logSync();
    if (isFile) {
      // Audit with the caller-supplied (unresolved) path.
      logAuditEvent(true, "setReplication", srcArg);
    }
    return isFile;
  }
2322
2323  /**
2324   * Set the storage policy for a file or a directory.
2325   *
2326   * @param src file/directory path
2327   * @param policyName storage policy name
2328   */
2329  void setStoragePolicy(String src, final String policyName)
2330      throws IOException {
2331    try {
2332      setStoragePolicyInt(src, policyName);
2333    } catch (AccessControlException e) {
2334      logAuditEvent(false, "setStoragePolicy", src);
2335      throw e;
2336    }
2337  }
2338
2339  private void setStoragePolicyInt(String src, final String policyName)
2340      throws IOException, UnresolvedLinkException, AccessControlException {
2341
2342    if (!isStoragePolicyEnabled) {
2343      throw new IOException("Failed to set storage policy since "
2344          + DFS_STORAGE_POLICY_ENABLED_KEY + " is set to false.");
2345    }
2346    FSPermissionChecker pc = null;
2347    if (isPermissionEnabled) {
2348      pc = getPermissionChecker();
2349    }
2350
2351    checkOperation(OperationCategory.WRITE);
2352    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2353    waitForLoadingFSImage();
2354    HdfsFileStatus fileStat;
2355    writeLock();
2356    try {
2357      checkOperation(OperationCategory.WRITE);
2358      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2359
2360      if (pc != null) {
2361        checkPermission(pc, src, false, null, null, FsAction.WRITE, null,
2362                        false, true);
2363      }
2364
2365      src = FSDirectory.resolvePath(src, pathComponents, dir);
2366
2367      // get the corresponding policy and make sure the policy name is valid
2368      BlockStoragePolicy policy = blockManager.getStoragePolicy(policyName);
2369      if (policy == null) {
2370        throw new HadoopIllegalArgumentException(
2371            "Cannot find a block policy with the name " + policyName);
2372      }
2373      dir.setStoragePolicy(src, policy.getId());
2374      getEditLog().logSetStoragePolicy(src, policy.getId());
2375      fileStat = getAuditFileInfo(src, false);
2376    } finally {
2377      writeUnlock();
2378    }
2379
2380    getEditLog().logSync();
2381    logAuditEvent(true, "setStoragePolicy", src, null, fileStat);
2382  }
2383
2384  /**
2385   * @return All the existing block storage policies
2386   */
2387  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2388    checkOperation(OperationCategory.READ);
2389    waitForLoadingFSImage();
2390    readLock();
2391    try {
2392      checkOperation(OperationCategory.READ);
2393      return blockManager.getStoragePolicies();
2394    } finally {
2395      readUnlock();
2396    }
2397  }
2398
2399  long getPreferredBlockSize(String filename) 
2400      throws IOException, UnresolvedLinkException {
2401    FSPermissionChecker pc = getPermissionChecker();
2402    checkOperation(OperationCategory.READ);
2403    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(filename);
2404    readLock();
2405    try {
2406      checkOperation(OperationCategory.READ);
2407      filename = resolvePath(filename, pathComponents);
2408      if (isPermissionEnabled) {
2409        checkTraverse(pc, filename);
2410      }
2411      return dir.getPreferredBlockSize(filename);
2412    } finally {
2413      readUnlock();
2414    }
2415  }
2416
2417  /**
2418   * Verify that parent directory of src exists.
2419   */
2420  private void verifyParentDir(String src) throws FileNotFoundException,
2421      ParentNotDirectoryException, UnresolvedLinkException {
2422    assert hasReadLock();
2423    Path parent = new Path(src).getParent();
2424    if (parent != null) {
2425      final INode parentNode = dir.getINode(parent.toString());
2426      if (parentNode == null) {
2427        throw new FileNotFoundException("Parent directory doesn't exist: "
2428            + parent);
2429      } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) {
2430        throw new ParentNotDirectoryException("Parent path is not a directory: "
2431            + parent);
2432      }
2433    }
2434  }
2435
2436  /**
2437   * If the file is within an encryption zone, select the appropriate 
2438   * CryptoProtocolVersion from the list provided by the client. Since the
2439   * client may be newer, we need to handle unknown versions.
2440   *
2441   * @param zone EncryptionZone of the file
2442   * @param supportedVersions List of supported protocol versions
2443   * @return chosen protocol version
2444   * @throws IOException
2445   */
2446  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2447      CryptoProtocolVersion[] supportedVersions)
2448      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2449        SnapshotAccessControlException {
2450    Preconditions.checkNotNull(zone);
2451    Preconditions.checkNotNull(supportedVersions);
2452    // Right now, we only support a single protocol version,
2453    // so simply look for it in the list of provided options
2454    final CryptoProtocolVersion required = zone.getVersion();
2455
2456    for (CryptoProtocolVersion c : supportedVersions) {
2457      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2458        if (LOG.isDebugEnabled()) {
2459          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2460              "client: " + c.getUnknownValue());
2461        }
2462        continue;
2463      }
2464      if (c.equals(required)) {
2465        return c;
2466      }
2467    }
2468    throw new UnknownCryptoProtocolVersionException(
2469        "No crypto protocol versions provided by the client are supported."
2470            + " Client provided: " + Arrays.toString(supportedVersions)
2471            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2472            .values()));
2473  }
2474
2475  /**
2476   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2477   * encryption zone. Should not be called with any locks held.
2478   *
2479   * @param ezKeyName key name of an encryption zone
2480   * @return New EDEK, or null if ezKeyName is null
2481   * @throws IOException
2482   */
2483  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2484      ezKeyName) throws IOException {
2485    if (ezKeyName == null) {
2486      return null;
2487    }
2488    EncryptedKeyVersion edek = null;
2489    try {
2490      edek = provider.generateEncryptedKey(ezKeyName);
2491    } catch (GeneralSecurityException e) {
2492      throw new IOException(e);
2493    }
2494    Preconditions.checkNotNull(edek);
2495    return edek;
2496  }
2497
2498  /**
2499   * Create a new file entry in the namespace.
2500   * 
2501   * For description of parameters and exceptions thrown see
2502   * {@link ClientProtocol#create}, except it returns valid file status upon
2503   * success
2504   */
2505  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2506      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2507      boolean createParent, short replication, long blockSize, 
2508      CryptoProtocolVersion[] supportedVersions)
2509      throws AccessControlException, SafeModeException,
2510      FileAlreadyExistsException, UnresolvedLinkException,
2511      FileNotFoundException, ParentNotDirectoryException, IOException {
2512    HdfsFileStatus status = null;
2513    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2514        null);
2515    if (cacheEntry != null && cacheEntry.isSuccess()) {
2516      return (HdfsFileStatus) cacheEntry.getPayload();
2517    }
2518    
2519    try {
2520      status = startFileInt(src, permissions, holder, clientMachine, flag,
2521          createParent, replication, blockSize, supportedVersions,
2522          cacheEntry != null);
2523    } catch (AccessControlException e) {
2524      logAuditEvent(false, "create", src);
2525      throw e;
2526    } finally {
2527      RetryCache.setState(cacheEntry, status != null, status);
2528    }
2529    return status;
2530  }
2531
2532  private HdfsFileStatus startFileInt(final String srcArg,
2533      PermissionStatus permissions, String holder, String clientMachine,
2534      EnumSet<CreateFlag> flag, boolean createParent, short replication,
2535      long blockSize, CryptoProtocolVersion[] supportedVersions,
2536      boolean logRetryCache)
2537      throws AccessControlException, SafeModeException,
2538      FileAlreadyExistsException, UnresolvedLinkException,
2539      FileNotFoundException, ParentNotDirectoryException, IOException {
2540    String src = srcArg;
2541    if (NameNode.stateChangeLog.isDebugEnabled()) {
2542      StringBuilder builder = new StringBuilder();
2543      builder.append("DIR* NameSystem.startFile: src=" + src
2544              + ", holder=" + holder
2545              + ", clientMachine=" + clientMachine
2546              + ", createParent=" + createParent
2547              + ", replication=" + replication
2548              + ", createFlag=" + flag.toString()
2549              + ", blockSize=" + blockSize);
2550      builder.append(", supportedVersions=");
2551      if (supportedVersions != null) {
2552        builder.append(Arrays.toString(supportedVersions));
2553      } else {
2554        builder.append("null");
2555      }
2556      NameNode.stateChangeLog.debug(builder.toString());
2557    }
2558    if (!DFSUtil.isValidName(src)) {
2559      throw new InvalidPathException(src);
2560    }
2561    blockManager.verifyReplication(src, replication, clientMachine);
2562
2563    boolean skipSync = false;
2564    HdfsFileStatus stat = null;
2565    FSPermissionChecker pc = getPermissionChecker();
2566    if (blockSize < minBlockSize) {
2567      throw new IOException("Specified block size is less than configured" +
2568          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2569          + "): " + blockSize + " < " + minBlockSize);
2570    }
2571    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2572    boolean create = flag.contains(CreateFlag.CREATE);
2573    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2574    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);
2575
2576    waitForLoadingFSImage();
2577
2578    /**
2579     * If the file is in an encryption zone, we optimistically create an
2580     * EDEK for the file by calling out to the configured KeyProvider.
2581     * Since this typically involves doing an RPC, we take the readLock
2582     * initially, then drop it to do the RPC.
2583     * 
2584     * Since the path can flip-flop between being in an encryption zone and not
2585     * in the meantime, we need to recheck the preconditions when we retake the
2586     * lock to do the create. If the preconditions are not met, we throw a
2587     * special RetryStartFileException to ask the DFSClient to try the create
2588     * again later.
2589     */
2590    CryptoProtocolVersion protocolVersion = null;
2591    CipherSuite suite = null;
2592    String ezKeyName = null;
2593    readLock();
2594    try {
2595      src = resolvePath(src, pathComponents);
2596      INodesInPath iip = dir.getINodesInPath4Write(src);
2597      // Nothing to do if the path is not within an EZ
2598      if (dir.isInAnEZ(iip)) {
2599        EncryptionZone zone = dir.getEZForPath(iip);
2600        protocolVersion = chooseProtocolVersion(zone, supportedVersions);
2601        suite = zone.getSuite();
2602        ezKeyName = dir.getKeyName(iip);
2603
2604        Preconditions.checkNotNull(protocolVersion);
2605        Preconditions.checkNotNull(suite);
2606        Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
2607            "Chose an UNKNOWN CipherSuite!");
2608        Preconditions.checkNotNull(ezKeyName);
2609      }
2610    } finally {
2611      readUnlock();
2612    }
2613
2614    Preconditions.checkState(
2615        (suite == null && ezKeyName == null) ||
2616            (suite != null && ezKeyName != null),
2617        "Both suite and ezKeyName should both be null or not null");
2618
2619    // Generate EDEK if necessary while not holding the lock
2620    EncryptedKeyVersion edek =
2621        generateEncryptedDataEncryptionKey(ezKeyName);
2622    EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
2623
2624    // Proceed with the create, using the computed cipher suite and 
2625    // generated EDEK
2626    BlocksMapUpdateInfo toRemoveBlocks = null;
2627    writeLock();
2628    try {
2629      checkOperation(OperationCategory.WRITE);
2630      checkNameNodeSafeMode("Cannot create file" + src);
2631      src = resolvePath(src, pathComponents);
2632      toRemoveBlocks = startFileInternal(pc, src, permissions, holder, 
2633          clientMachine, create, overwrite, createParent, replication, 
2634          blockSize, isLazyPersist, suite, protocolVersion, edek, logRetryCache);
2635      stat = dir.getFileInfo(src, false,
2636          FSDirectory.isReservedRawName(srcArg), true);
2637    } catch (StandbyException se) {
2638      skipSync = true;
2639      throw se;
2640    } finally {
2641      writeUnlock();
2642      // There might be transactions logged while trying to recover the lease.
2643      // They need to be sync'ed even when an exception was thrown.
2644      if (!skipSync) {
2645        getEditLog().logSync();
2646        if (toRemoveBlocks != null) {
2647          removeBlocks(toRemoveBlocks);
2648          toRemoveBlocks.clear();
2649        }
2650      }
2651    }
2652
2653    logAuditEvent(true, "create", srcArg, null, stat);
2654    return stat;
2655  }
2656
2657  /**
2658   * Create a new file or overwrite an existing file<br>
2659   * 
2660   * Once the file is create the client then allocates a new block with the next
2661   * call using {@link ClientProtocol#addBlock}.
2662   * <p>
2663   * For description of parameters and exceptions thrown see
2664   * {@link ClientProtocol#create}
2665   */
2666  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2667      String src, PermissionStatus permissions, String holder, 
2668      String clientMachine, boolean create, boolean overwrite, 
2669      boolean createParent, short replication, long blockSize, 
2670      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2671      EncryptedKeyVersion edek, boolean logRetryEntry)
2672      throws FileAlreadyExistsException, AccessControlException,
2673      UnresolvedLinkException, FileNotFoundException,
2674      ParentNotDirectoryException, RetryStartFileException, IOException {
2675    assert hasWriteLock();
2676    // Verify that the destination does not exist as a directory already.
2677    final INodesInPath iip = dir.getINodesInPath4Write(src);
2678    final INode inode = iip.getLastINode();
2679    if (inode != null && inode.isDirectory()) {
2680      throw new FileAlreadyExistsException(src +
2681          " already exists as a directory");
2682    }
2683
2684    FileEncryptionInfo feInfo = null;
2685    if (dir.isInAnEZ(iip)) {
2686      // The path is now within an EZ, but we're missing encryption parameters
2687      if (suite == null || edek == null) {
2688        throw new RetryStartFileException();
2689      }
2690      // Path is within an EZ and we have provided encryption parameters.
2691      // Make sure that the generated EDEK matches the settings of the EZ.
2692      String ezKeyName = dir.getKeyName(iip);
2693      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2694        throw new RetryStartFileException();
2695      }
2696      feInfo = new FileEncryptionInfo(suite, version,
2697          edek.getEncryptedKeyVersion().getMaterial(),
2698          edek.getEncryptedKeyIv(),
2699          ezKeyName, edek.getEncryptionKeyVersionName());
2700      Preconditions.checkNotNull(feInfo);
2701    }
2702
2703    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2704    if (isPermissionEnabled) {
2705      if (overwrite && myFile != null) {
2706        checkPathAccess(pc, src, FsAction.WRITE);
2707      }
2708      /*
2709       * To overwrite existing file, need to check 'w' permission 
2710       * of parent (equals to ancestor in this case)
2711       */
2712      checkAncestorAccess(pc, src, FsAction.WRITE);
2713    }
2714
2715    if (!createParent) {
2716      verifyParentDir(src);
2717    }
2718
2719    try {
2720      BlocksMapUpdateInfo toRemoveBlocks = null;
2721      if (myFile == null) {
2722        if (!create) {
2723          throw new FileNotFoundException("Can't overwrite non-existent " +
2724              src + " for client " + clientMachine);
2725        }
2726      } else {
2727        if (overwrite) {
2728          toRemoveBlocks = new BlocksMapUpdateInfo();
2729          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2730          long ret = dir.delete(src, toRemoveBlocks, toRemoveINodes, now());
2731          if (ret >= 0) {
2732            incrDeletedFileCount(ret);
2733            removePathAndBlocks(src, null, toRemoveINodes, true);
2734          }
2735        } else {
2736          // If lease soft limit time is expired, recover the lease
2737          recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2738          throw new FileAlreadyExistsException(src + " for client " +
2739              clientMachine + " already exists");
2740        }
2741      }
2742
2743      checkFsObjectLimit();
2744      INodeFile newNode = null;
2745
2746      // Always do an implicit mkdirs for parent directory tree.
2747      Path parent = new Path(src).getParent();
2748      if (parent != null && mkdirsRecursively(parent.toString(),
2749              permissions, true, now())) {
2750        newNode = dir.addFile(src, permissions, replication, blockSize,
2751                              holder, clientMachine);
2752      }
2753
2754      if (newNode == null) {
2755        throw new IOException("Unable to add " + src +  " to namespace");
2756      }
2757      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2758          .getClientName(), src);
2759
2760      // Set encryption attributes if necessary
2761      if (feInfo != null) {
2762        dir.setFileEncryptionInfo(src, feInfo);
2763        newNode = dir.getInode(newNode.getId()).asFile();
2764      }
2765
2766      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2767
2768      // record file record in log, record new generation stamp
2769      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2770      if (NameNode.stateChangeLog.isDebugEnabled()) {
2771        NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added " +
2772            src + " inode " + newNode.getId() + " " + holder);
2773      }
2774      return toRemoveBlocks;
2775    } catch (IOException ie) {
2776      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2777          ie.getMessage());
2778      throw ie;
2779    }
2780  }
2781
2782  private void setNewINodeStoragePolicy(INodeFile inode,
2783                                        INodesInPath iip,
2784                                        boolean isLazyPersist)
2785      throws IOException {
2786
2787    if (isLazyPersist) {
2788      BlockStoragePolicy lpPolicy =
2789          blockManager.getStoragePolicy("LAZY_PERSIST");
2790
2791      // Set LAZY_PERSIST storage policy if the flag was passed to
2792      // CreateFile.
2793      if (lpPolicy == null) {
2794        throw new HadoopIllegalArgumentException(
2795            "The LAZY_PERSIST storage policy has been disabled " +
2796            "by the administrator.");
2797      }
2798      inode.setStoragePolicyID(lpPolicy.getId(),
2799                                 iip.getLatestSnapshotId());
2800    } else {
2801      BlockStoragePolicy effectivePolicy =
2802          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2803
2804      if (effectivePolicy != null &&
2805          effectivePolicy.isCopyOnCreateFile()) {
2806        // Copy effective policy from ancestor directory to current file.
2807        inode.setStoragePolicyID(effectivePolicy.getId(),
2808                                 iip.getLatestSnapshotId());
2809      }
2810    }
2811  }
2812
2813  /**
2814   * Append to an existing file for append.
2815   * <p>
2816   * 
2817   * The method returns the last block of the file if this is a partial block,
2818   * which can still be used for writing more data. The client uses the returned
2819   * block locations to form the data pipeline for this block.<br>
2820   * The method returns null if the last block is full. The client then
2821   * allocates a new block with the next call using
2822   * {@link ClientProtocol#addBlock}.
2823   * <p>
2824   * 
2825   * For description of parameters and exceptions thrown see
2826   * {@link ClientProtocol#append(String, String)}
2827   * 
2828   * @return the last block locations if the block is partial or null otherwise
2829   */
2830  private LocatedBlock appendFileInternal(FSPermissionChecker pc, String src,
2831      String holder, String clientMachine, boolean logRetryCache)
2832      throws AccessControlException, UnresolvedLinkException,
2833      FileNotFoundException, IOException {
2834    assert hasWriteLock();
2835    // Verify that the destination does not exist as a directory already.
2836    final INodesInPath iip = dir.getINodesInPath4Write(src);
2837    final INode inode = iip.getLastINode();
2838    if (inode != null && inode.isDirectory()) {
2839      throw new FileAlreadyExistsException("Cannot append to directory " + src
2840          + "; already exists as a directory.");
2841    }
2842    if (isPermissionEnabled) {
2843      checkPathAccess(pc, src, FsAction.WRITE);
2844    }
2845
2846    try {
2847      if (inode == null) {
2848        throw new FileNotFoundException("failed to append to non-existent file "
2849          + src + " for client " + clientMachine);
2850      }
2851      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2852      final BlockStoragePolicy lpPolicy =
2853          blockManager.getStoragePolicy("LAZY_PERSIST");
2854
2855      if (lpPolicy != null &&
2856          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2857        throw new UnsupportedOperationException(
2858            "Cannot append to lazy persist file " + src);
2859      }
2860      // Opening an existing file for write - may need to recover lease.
2861      recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2862      
2863      // recoverLeaseInternal may create a new InodeFile via 
2864      // finalizeINodeFileUnderConstruction so we need to refresh 
2865      // the referenced file.  
2866      myFile = INodeFile.valueOf(dir.getINode(src), src, true);
2867      final BlockInfo lastBlock = myFile.getLastBlock();
2868      // Check that the block has at least minimum replication.
2869      if(lastBlock != null && lastBlock.isComplete() &&
2870          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2871        throw new IOException("append: lastBlock=" + lastBlock +
2872            " of src=" + src + " is not sufficiently replicated yet.");
2873      }
2874      return prepareFileForWrite(src, iip, holder, clientMachine, true,
2875          logRetryCache);
2876    } catch (IOException ie) {
2877      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2878      throw ie;
2879    }
2880  }
2881  
2882  /**
2883   * Replace current node with a INodeUnderConstruction.
2884   * Recreate in-memory lease record.
2885   * 
2886   * @param src path to the file
2887   * @param file existing file object
2888   * @param leaseHolder identifier of the lease holder on this file
2889   * @param clientMachine identifier of the client machine
2890   * @param writeToEditLog whether to persist this change to the edit log
2891   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2892   *                      rebuilding
2893   * @return the last block locations if the block is partial or null otherwise
2894   * @throws UnresolvedLinkException
2895   * @throws IOException
2896   */
2897  LocatedBlock prepareFileForWrite(String src, INodesInPath iip,
2898      String leaseHolder, String clientMachine, boolean writeToEditLog,
2899      boolean logRetryCache) throws IOException {
2900    final INodeFile file = iip.getLastINode().asFile();
2901    final Quota.Counts delta = verifyQuotaForUCBlock(file, iip);
2902
2903    file.recordModification(iip.getLatestSnapshotId());
2904    file.toUnderConstruction(leaseHolder, clientMachine);
2905
2906    leaseManager.addLease(
2907        file.getFileUnderConstructionFeature().getClientName(), src);
2908
2909    LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(file);
2910    if (ret != null && delta != null) {
2911      Preconditions.checkState(delta.get(Quota.DISKSPACE) >= 0,
2912          "appending to a block with size larger than the preferred block size");
2913      dir.writeLock();
2914      try {
2915        dir.updateCountNoQuotaCheck(iip, iip.length() - 1,
2916            delta.get(Quota.NAMESPACE), delta.get(Quota.DISKSPACE));
2917      } finally {
2918        dir.writeUnlock();
2919      }
2920    }
2921
2922    if (writeToEditLog) {
2923      getEditLog().logOpenFile(src, file, false, logRetryCache);
2924    }
2925    return ret;
2926  }
2927
2928  /**
2929   * Verify quota when using the preferred block size for UC block. This is
2930   * usually used by append and truncate
2931   * @throws QuotaExceededException when violating the storage quota
2932   * @return expected quota usage update. null means no change or no need to
2933   *         update quota usage later
2934   */
2935  private Quota.Counts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2936      throws QuotaExceededException {
2937    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2938      // Do not check quota if editlog is still being processed
2939      return null;
2940    }
2941    if (file.getLastBlock() != null) {
2942      final Quota.Counts delta = computeQuotaDeltaForUCBlock(file);
2943      dir.readLock();
2944      try {
2945        FSDirectory.verifyQuota(iip.getINodes(), iip.length() - 1,
2946            delta.get(Quota.NAMESPACE), delta.get(Quota.DISKSPACE), null);
2947        return delta;
2948      } finally {
2949        dir.readUnlock();
2950      }
2951    }
2952    return null;
2953  }
2954
2955  /** Compute quota change for converting a complete block to a UC block */
2956  private Quota.Counts computeQuotaDeltaForUCBlock(INodeFile file) {
2957    final BlockInfo lastBlock = file.getLastBlock();
2958    if (lastBlock != null) {
2959      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2960      final short repl = file.getBlockReplication();
2961      return Quota.Counts.newInstance(0, diff * repl);
2962    } else {
2963      return Quota.Counts.newInstance();
2964    }
2965  }
2966
2967  /**
2968   * Recover lease;
2969   * Immediately revoke the lease of the current lease holder and start lease
2970   * recovery so that the file can be forced to be closed.
2971   * 
2972   * @param src the path of the file to start lease recovery
2973   * @param holder the lease holder's name
2974   * @param clientMachine the client machine's name
2975   * @return true if the file is already closed
2976   * @throws IOException
2977   */
2978  boolean recoverLease(String src, String holder, String clientMachine)
2979      throws IOException {
2980    if (!DFSUtil.isValidName(src)) {
2981      throw new IOException("Invalid file name: " + src);
2982    }
2983  
2984    boolean skipSync = false;
2985    FSPermissionChecker pc = getPermissionChecker();
2986    checkOperation(OperationCategory.WRITE);
2987    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2988    writeLock();
2989    try {
2990      checkOperation(OperationCategory.WRITE);
2991      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2992      src = resolvePath(src, pathComponents);
2993      final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
2994      if (!inode.isUnderConstruction()) {
2995        return true;
2996      }
2997      if (isPermissionEnabled) {
2998        checkPathAccess(pc, src, FsAction.WRITE);
2999      }
3000  
3001      recoverLeaseInternal(inode, src, holder, clientMachine, true);
3002    } catch (StandbyException se) {
3003      skipSync = true;
3004      throw se;
3005    } finally {
3006      writeUnlock();
3007      // There might be transactions logged while trying to recover the lease.
3008      // They need to be sync'ed even when an exception was thrown.
3009      if (!skipSync) {
3010        getEditLog().logSync();
3011      }
3012    }
3013    return false;
3014  }
3015
  /**
   * Attempt lease recovery for a file that is (possibly) under construction.
   * No-op when {@code fileInode} is null or not under construction.
   *
   * @param fileInode inode of the file whose lease should be recovered
   * @param src path of the file
   * @param holder client requesting recovery (or attempting re-creation)
   * @param clientMachine machine of the requesting client; used only in
   *                      exception messages
   * @param force if true, release the lease immediately instead of waiting
   *              for the soft-limit expiration
   * @throws AlreadyBeingCreatedException if the current holder is trying to
   *         re-create the file, or if the file is actively being written by
   *         another live client
   * @throws RecoveryInProgressException if block recovery has been started
   *         but the file could not yet be closed
   */
  private void recoverLeaseInternal(INodeFile fileInode, 
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    if (fileInode != null && fileInode.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);
      //
      // We found the lease for this file. And surprisingly the original
      // holder is trying to recreate this file. This should never occur.
      //

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          throw new AlreadyBeingCreatedException(
            "failed to create file " + src + " for " + holder +
            " for client " + clientMachine +
            " because current leaseholder is trying to recreate file.");
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = fileInode.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        // Under construction implies a lease must exist; its absence means
        // the namesystem state is inconsistent for this file.
        throw new AlreadyBeingCreatedException(
          "failed to create file " + src + " for " + holder +
          " for client " + clientMachine +
          " because pendingCreates is non-null but no leases found.");
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        internalReleaseLease(lease, src, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          // Passing null as the new holder starts recovery rather than
          // reassigning the lease.
          boolean isClosed = internalReleaseLease(lease, src, null);
          if(!isClosed)
            throw new RecoveryInProgressException(
                "Failed to close file " + src +
                ". Lease recovery is in progress. Try again later.");
        } else {
          final BlockInfo lastBlock = fileInode.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException("Recovery in progress, file ["
                + src + "], " + "lease owner [" + lease.getHolder() + "]");
          } else {
            // The soft limit has not expired: the original writer is
            // presumed alive, so refuse to take over the file.
            throw new AlreadyBeingCreatedException("Failed to create file ["
                + src + "] for [" + holder + "] for client [" + clientMachine
                + "], because this file is already being created by ["
                + clientName + "] on ["
                + uc.getClientMachine() + "]");
          }
        }
      }
    }
  }
3091
3092  /**
3093   * Append to an existing file in the namespace.
3094   */
3095  LocatedBlock appendFile(String src, String holder, String clientMachine)
3096      throws AccessControlException, SafeModeException,
3097      FileAlreadyExistsException, FileNotFoundException,
3098      ParentNotDirectoryException, IOException {
3099    LocatedBlock lb = null;
3100    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
3101        null);
3102    if (cacheEntry != null && cacheEntry.isSuccess()) {
3103      return (LocatedBlock) cacheEntry.getPayload();
3104    }
3105      
3106    boolean success = false;
3107    try {
3108      lb = appendFileInt(src, holder, clientMachine, cacheEntry != null);
3109      success = true;
3110      return lb;
3111    } catch (AccessControlException e) {
3112      logAuditEvent(false, "append", src);
3113      throw e;
3114    } finally {
3115      RetryCache.setState(cacheEntry, success, lb);
3116    }
3117  }
3118
3119  private LocatedBlock appendFileInt(final String srcArg, String holder,
3120      String clientMachine, boolean logRetryCache)
3121      throws AccessControlException, SafeModeException,
3122      FileAlreadyExistsException, FileNotFoundException,
3123      ParentNotDirectoryException, IOException {
3124    String src = srcArg;
3125    if (NameNode.stateChangeLog.isDebugEnabled()) {
3126      NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
3127          + ", holder=" + holder
3128          + ", clientMachine=" + clientMachine);
3129    }
3130    boolean skipSync = false;
3131    if (!supportAppends) {
3132      throw new UnsupportedOperationException(
3133          "Append is not enabled on this NameNode. Use the " +
3134          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
3135    }
3136
3137    LocatedBlock lb = null;
3138    FSPermissionChecker pc = getPermissionChecker();
3139    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3140    writeLock();
3141    try {
3142      checkOperation(OperationCategory.WRITE);
3143      checkNameNodeSafeMode("Cannot append to file" + src);
3144      src = resolvePath(src, pathComponents);
3145      lb = appendFileInternal(pc, src, holder, clientMachine, logRetryCache);
3146    } catch (StandbyException se) {
3147      skipSync = true;
3148      throw se;
3149    } finally {
3150      writeUnlock();
3151      // There might be transactions logged while trying to recover the lease.
3152      // They need to be sync'ed even when an exception was thrown.
3153      if (!skipSync) {
3154        getEditLog().logSync();
3155      }
3156    }
3157    if (lb != null) {
3158      if (NameNode.stateChangeLog.isDebugEnabled()) {
3159        NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
3160            +src+" for "+holder+" at "+clientMachine
3161            +" block " + lb.getBlock()
3162            +" block size " + lb.getBlock().getNumBytes());
3163      }
3164    }
3165    logAuditEvent(true, "append", srcArg);
3166    return lb;
3167  }
3168
3169  ExtendedBlock getExtendedBlock(Block blk) {
3170    return new ExtendedBlock(blockPoolId, blk);
3171  }
3172  
3173  void setBlockPoolId(String bpid) {
3174    blockPoolId = bpid;
3175    blockManager.setBlockPoolId(blockPoolId);
3176  }
3177
3178  /**
3179   * The client would like to obtain an additional block for the indicated
3180   * filename (which is being written-to).  Return an array that consists
3181   * of the block, plus a set of machines.  The first on this list should
3182   * be where the client writes data.  Subsequent items in the list must
3183   * be provided in the connection to the first datanode.
3184   *
3185   * Make sure the previous blocks have been reported by datanodes and
3186   * are replicated.  Will return an empty 2-elt array if we want the
3187   * client to "try again later".
3188   */
3189  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
3190      ExtendedBlock previous, Set<Node> excludedNodes, 
3191      List<String> favoredNodes)
3192      throws LeaseExpiredException, NotReplicatedYetException,
3193      QuotaExceededException, SafeModeException, UnresolvedLinkException,
3194      IOException {
3195    final long blockSize;
3196    final int replication;
3197    final byte storagePolicyID;
3198    Node clientNode = null;
3199    String clientMachine = null;
3200
3201    if(NameNode.stateChangeLog.isDebugEnabled()) {
3202      NameNode.stateChangeLog.debug("BLOCK* NameSystem.getAdditionalBlock: "
3203          + src + " inodeId " +  fileId  + " for " + clientName);
3204    }
3205
3206    // Part I. Analyze the state of the file with respect to the input data.
3207    checkOperation(OperationCategory.READ);
3208    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3209    readLock();
3210    try {
3211      checkOperation(OperationCategory.READ);
3212      src = resolvePath(src, pathComponents);
3213      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3214      FileState fileState = analyzeFileState(
3215          src, fileId, clientName, previous, onRetryBlock);
3216      final INodeFile pendingFile = fileState.inode;
3217      src = fileState.path;
3218
3219      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
3220        // This is a retry. Just return the last block if having locations.
3221        return onRetryBlock[0];
3222      }
3223      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
3224        throw new IOException("File has reached the limit on maximum number of"
3225            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
3226            + "): " + pendingFile.getBlocks().length + " >= "
3227            + maxBlocksPerFile);
3228      }
3229      blockSize = pendingFile.getPreferredBlockSize();
3230      clientMachine = pendingFile.getFileUnderConstructionFeature()
3231          .getClientMachine();
3232      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
3233          clientMachine);
3234      replication = pendingFile.getFileReplication();
3235      storagePolicyID = pendingFile.getStoragePolicyID();
3236    } finally {
3237      readUnlock();
3238    }
3239
3240    if (clientNode == null) {
3241      clientNode = getClientNode(clientMachine);
3242    }
3243
3244    // choose targets for the new block to be allocated.
3245    final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget4NewBlock( 
3246        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
3247        storagePolicyID);
3248
3249    // Part II.
3250    // Allocate a new block, add it to the INode and the BlocksMap. 
3251    Block newBlock = null;
3252    long offset;
3253    checkOperation(OperationCategory.WRITE);
3254    waitForLoadingFSImage();
3255    writeLock();
3256    try {
3257      checkOperation(OperationCategory.WRITE);
3258      // Run the full analysis again, since things could have changed
3259      // while chooseTarget() was executing.
3260      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3261      FileState fileState = 
3262          analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
3263      final INodeFile pendingFile = fileState.inode;
3264      src = fileState.path;
3265
3266      if (onRetryBlock[0] != null) {
3267        if (onRetryBlock[0].getLocations().length > 0) {
3268          // This is a retry. Just return the last block if having locations.
3269          return onRetryBlock[0];
3270        } else {
3271          // add new chosen targets to already allocated block and return
3272          BlockInfo lastBlockInFile = pendingFile.getLastBlock();
3273          ((BlockInfoUnderConstruction) lastBlockInFile)
3274              .setExpectedLocations(targets);
3275          offset = pendingFile.computeFileSize();
3276          return makeLocatedBlock(lastBlockInFile, targets, offset);
3277        }
3278      }
3279
3280      // commit the last block and complete it if it has minimum replicas
3281      commitOrCompleteLastBlock(pendingFile,
3282                                ExtendedBlock.getLocalBlock(previous));
3283
3284      // allocate new block, record block locations in INode.
3285      newBlock = createNewBlock();
3286      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
3287      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
3288
3289      persistNewBlock(src, pendingFile);
3290      offset = pendingFile.computeFileSize();
3291    } finally {
3292      writeUnlock();
3293    }
3294    getEditLog().logSync();
3295
3296    // Return located block
3297    return makeLocatedBlock(newBlock, targets, offset);
3298  }
3299
3300  /*
3301   * Resolve clientmachine address to get a network location path
3302   */
3303  private Node getClientNode(String clientMachine) {
3304    List<String> hosts = new ArrayList<String>(1);
3305    hosts.add(clientMachine);
3306    List<String> rName = getBlockManager().getDatanodeManager()
3307        .resolveNetworkLocation(hosts);
3308    Node clientNode = null;
3309    if (rName != null) {
3310      // Able to resolve clientMachine mapping.
3311      // Create a temp node to findout the rack local nodes
3312      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3313          + clientMachine);
3314    }
3315    return clientNode;
3316  }
3317
  /**
   * Pairs an under-construction file's inode with its resolved full path,
   * as produced by {@code analyzeFileState}.
   */
  static class FileState {
    // Inode of the file being written.
    public final INodeFile inode;
    // Resolved absolute path of the file.
    public final String path;

    public FileState(INodeFile inode, String fullPath) {
      this.inode = inode;
      this.path = fullPath;
    }
  }
3327
  /**
   * Validate a block-allocation request against the current state of the
   * file, detecting client RPC retries. Must be called with at least the
   * read lock held.
   *
   * @param src path of the file; may be re-resolved from the inode id
   * @param fileId inode id, or GRANDFATHER_INODE_ID for older clients
   * @param clientName lease holder making the request
   * @param previous the client's view of the file's current last block
   * @param onRetryBlock output parameter: element 0 is set to the already
   *        allocated last block when the request is recognized as a retry
   * @return the file's inode together with its resolved path
   * @throws IOException if the request is inconsistent with the file state
   */
  FileState analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    INode inode;
    if (fileId == INodeId.GRANDFATHER_INODE_ID) {
      // Older clients may not have given us an inode ID to work with.
      // In this case, we have to try to resolve the path and hope it
      // hasn't changed or been deleted since the file was opened for write.
      final INodesInPath iip = dir.getINodesInPath4Write(src);
      inode = iip.getLastINode();
    } else {
      // Newer clients pass the inode ID, so we can just get the inode
      // directly.
      inode = dir.getInode(fileId);
      if (inode != null) src = inode.getFullPathName();
    }
    final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
    BlockInfo lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        if (NameNode.stateChangeLog.isDebugEnabled()) {
           NameNode.stateChangeLog.debug(
               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
               " writing to a file with a complete previous block: src=" +
               src + " lastBlock=" + lastBlockInFile);
        }
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          // The "retry" block already has data in it, so this cannot be a
          // harmless re-issued RPC (see case 3).
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }

    // Check if the penultimate block is minimally replicated
    if (!checkFileProgress(pendingFile, false)) {
      throw new NotReplicatedYetException("Not replicated yet: " + src);
    }
    return new FileState(pendingFile, src);
  }
3431
3432  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3433                                        long offset) throws IOException {
3434    LocatedBlock lBlk = new LocatedBlock(
3435        getExtendedBlock(blk), locs, offset, false);
3436    getBlockManager().setBlockToken(
3437        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3438    return lBlk;
3439  }
3440
3441  /** @see ClientProtocol#getAdditionalDatanode */
3442  LocatedBlock getAdditionalDatanode(String src, long fileId,
3443      final ExtendedBlock blk, final DatanodeInfo[] existings,
3444      final String[] storageIDs,
3445      final Set<Node> excludes,
3446      final int numAdditionalNodes, final String clientName
3447      ) throws IOException {
3448    //check if the feature is enabled
3449    dtpReplaceDatanodeOnFailure.checkEnabled();
3450
3451    Node clientnode = null;
3452    String clientMachine;
3453    final long preferredblocksize;
3454    final byte storagePolicyID;
3455    final List<DatanodeStorageInfo> chosen;
3456    checkOperation(OperationCategory.READ);
3457    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3458    readLock();
3459    try {
3460      checkOperation(OperationCategory.READ);
3461      //check safe mode
3462      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
3463      src = resolvePath(src, pathComponents);
3464
3465      //check lease
3466      final INode inode;
3467      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
3468        // Older clients may not have given us an inode ID to work with.
3469        // In this case, we have to try to resolve the path and hope it
3470        // hasn't changed or been deleted since the file was opened for write.
3471        inode = dir.getINode(src);
3472      } else {
3473        inode = dir.getInode(fileId);
3474        if (inode != null) src = inode.getFullPathName();
3475      }
3476      final INodeFile file = checkLease(src, clientName, inode, fileId);
3477      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
3478      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
3479      preferredblocksize = file.getPreferredBlockSize();
3480      storagePolicyID = file.getStoragePolicyID();
3481
3482      //find datanode storages
3483      final DatanodeManager dm = blockManager.getDatanodeManager();
3484      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs));
3485    } finally {
3486      readUnlock();
3487    }
3488
3489    if (clientnode == null) {
3490      clientnode = getClientNode(clientMachine);
3491    }
3492
3493    // choose new datanodes.
3494    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
3495        src, numAdditionalNodes, clientnode, chosen, 
3496        excludes, preferredblocksize, storagePolicyID);
3497    final LocatedBlock lb = new LocatedBlock(blk, targets);
3498    blockManager.setBlockToken(lb, AccessMode.COPY);
3499    return lb;
3500  }
3501
3502  /**
3503   * The client would like to let go of the given block
3504   */
3505  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3506      throws LeaseExpiredException, FileNotFoundException,
3507      UnresolvedLinkException, IOException {
3508    if(NameNode.stateChangeLog.isDebugEnabled()) {
3509      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
3510          + "of file " + src);
3511    }
3512    checkOperation(OperationCategory.WRITE);
3513    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3514    waitForLoadingFSImage();
3515    writeLock();
3516    try {
3517      checkOperation(OperationCategory.WRITE);
3518      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3519      src = resolvePath(src, pathComponents);
3520
3521      final INode inode;
3522      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
3523        // Older clients may not have given us an inode ID to work with.
3524        // In this case, we have to try to resolve the path and hope it
3525        // hasn't changed or been deleted since the file was opened for write.
3526        inode = dir.getINode(src);
3527      } else {
3528        inode = dir.getInode(fileId);
3529        if (inode != null) src = inode.getFullPathName();
3530      }
3531      final INodeFile file = checkLease(src, holder, inode, fileId);
3532
3533      //
3534      // Remove the block from the pending creates list
3535      //
3536      boolean removed = dir.removeBlock(src, file,
3537          ExtendedBlock.getLocalBlock(b));
3538      if (!removed) {
3539        return true;
3540      }
3541      if(NameNode.stateChangeLog.isDebugEnabled()) {
3542        NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
3543                                      + b + " is removed from pendingCreates");
3544      }
3545      persistBlocks(src, file, false);
3546    } finally {
3547      writeUnlock();
3548    }
3549    getEditLog().logSync();
3550
3551    return true;
3552  }
3553
3554  private INodeFile checkLease(String src, String holder, INode inode,
3555                               long fileId)
3556      throws LeaseExpiredException, FileNotFoundException {
3557    assert hasReadLock();
3558    final String ident = src + " (inode " + fileId + ")";
3559    if (inode == null) {
3560      Lease lease = leaseManager.getLease(holder);
3561      throw new LeaseExpiredException(
3562          "No lease on " + ident + ": File does not exist. "
3563          + (lease != null ? lease.toString()
3564              : "Holder " + holder + " does not have any open files."));
3565    }
3566    if (!inode.isFile()) {
3567      Lease lease = leaseManager.getLease(holder);
3568      throw new LeaseExpiredException(
3569          "No lease on " + ident + ": INode is not a regular file. "
3570              + (lease != null ? lease.toString()
3571              : "Holder " + holder + " does not have any open files."));
3572    }
3573    final INodeFile file = inode.asFile();
3574    if (!file.isUnderConstruction()) {
3575      Lease lease = leaseManager.getLease(holder);
3576      throw new LeaseExpiredException(
3577          "No lease on " + ident + ": File is not open for writing. "
3578          + (lease != null ? lease.toString()
3579              : "Holder " + holder + " does not have any open files."));
3580    }
3581    // No further modification is allowed on a deleted file.
3582    // A file is considered deleted, if it is not in the inodeMap or is marked
3583    // as deleted in the snapshot feature.
3584    if (isFileDeleted(file)) {
3585      throw new FileNotFoundException(src);
3586    }
3587    String clientName = file.getFileUnderConstructionFeature().getClientName();
3588    if (holder != null && !clientName.equals(holder)) {
3589      throw new LeaseExpiredException("Lease mismatch on " + ident +
3590          " owned by " + clientName + " but is accessed by " + holder);
3591    }
3592    return file;
3593  }
3594 
3595  /**
3596   * Complete in-progress write to the given file.
3597   * @return true if successful, false if the client should continue to retry
3598   *         (e.g if not all blocks have reached minimum replication yet)
3599   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3600   */
3601  boolean completeFile(final String srcArg, String holder,
3602                       ExtendedBlock last, long fileId)
3603    throws SafeModeException, UnresolvedLinkException, IOException {
3604    String src = srcArg;
3605    if (NameNode.stateChangeLog.isDebugEnabled()) {
3606      NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
3607          src + " for " + holder);
3608    }
3609    checkBlock(last);
3610    boolean success = false;
3611    checkOperation(OperationCategory.WRITE);
3612    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3613    waitForLoadingFSImage();
3614    writeLock();
3615    try {
3616      checkOperation(OperationCategory.WRITE);
3617      checkNameNodeSafeMode("Cannot complete file " + src);
3618      src = resolvePath(src, pathComponents);
3619      success = completeFileInternal(src, holder,
3620        ExtendedBlock.getLocalBlock(last), fileId);
3621    } finally {
3622      writeUnlock();
3623    }
3624    getEditLog().logSync();
3625    if (success) {
3626      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3627          + " is closed by " + holder);
3628    }
3629    return success;
3630  }
3631
  /**
   * Body of {@link #completeFile}: commit the last block and finalize the
   * file. Must be called with the write lock held.
   *
   * @param src path of the file; may be re-resolved from the inode id
   * @param holder lease holder (client name)
   * @param last the client's view of the file's last block
   * @param fileId inode id, or GRANDFATHER_INODE_ID for older clients
   * @return true if the file was closed; false if the caller should retry
   *         because some block has not reached minimal replication
   */
  private boolean completeFileInternal(String src, 
      String holder, Block last, long fileId) throws SafeModeException,
      UnresolvedLinkException, IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    try {
      final INode inode;
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        final INodesInPath iip = dir.getLastINodeInPath(src);
        inode = iip.getINode(0);
      } else {
        inode = dir.getInode(fileId);
        if (inode != null) src = inode.getFullPathName();
      }
      pendingFile = checkLease(src, holder, inode, fileId);
    } catch (LeaseExpiredException lee) {
      final INode inode = dir.getINode(src);
      if (inode != null
          && inode.isFile()
          && !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, last);

    if (!checkFileProgress(pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3689
3690  /**
3691   * Save allocated block at the given pending filename
3692   * 
3693   * @param src path to the file
3694   * @param inodesInPath representing each of the components of src.
3695   *                     The last INode is the INode for {@code src} file.
3696   * @param newBlock newly allocated block to be save
3697   * @param targets target datanodes where replicas of the new block is placed
3698   * @throws QuotaExceededException If addition of block exceeds space quota
3699   */
3700  BlockInfo saveAllocatedBlock(String src, INodesInPath inodes,
3701      Block newBlock, DatanodeStorageInfo[] targets)
3702          throws IOException {
3703    assert hasWriteLock();
3704    BlockInfo b = dir.addBlock(src, inodes, newBlock, targets);
3705    NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + src + ". "
3706        + getBlockPoolId() + " " + b);
3707    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3708    return b;
3709  }
3710
3711  /**
3712   * Create new block with a unique block id and a new generation stamp.
3713   */
3714  Block createNewBlock() throws IOException {
3715    assert hasWriteLock();
3716    Block b = new Block(nextBlockId(), 0, 0);
3717    // Increment the generation stamp for every new block.
3718    b.setGenerationStamp(nextGenerationStamp(false));
3719    return b;
3720  }
3721
3722  /**
3723   * Check that the indicated file's blocks are present and
3724   * replicated.  If not, return false. If checkall is true, then check
3725   * all blocks, otherwise check only penultimate block.
3726   */
3727  boolean checkFileProgress(INodeFile v, boolean checkall) {
3728    readLock();
3729    try {
3730      if (checkall) {
3731        //
3732        // check all blocks of the file.
3733        //
3734        for (BlockInfo block: v.getBlocks()) {
3735          if (!block.isComplete()) {
3736            LOG.info("BLOCK* checkFileProgress: " + block
3737                + " has not reached minimal replication "
3738                + blockManager.minReplication);
3739            return false;
3740          }
3741        }
3742      } else {
3743        //
3744        // check the penultimate block of this file
3745        //
3746        BlockInfo b = v.getPenultimateBlock();
3747        if (b != null && !b.isComplete()) {
3748          LOG.warn("BLOCK* checkFileProgress: " + b
3749              + " has not reached minimal replication "
3750              + blockManager.minReplication);
3751          return false;
3752        }
3753      }
3754      return true;
3755    } finally {
3756      readUnlock();
3757    }
3758  }
3759
3760  ////////////////////////////////////////////////////////////////
3761  // Here's how to handle block-copy failure during client write:
3762  // -- As usual, the client's write should result in a streaming
3763  // backup write to a k-machine sequence.
3764  // -- If one of the backup machines fails, no worries.  Fail silently.
3765  // -- Before client is allowed to close and finalize file, make sure
3766  // that the blocks are backed up.  Namenode may have to issue specific backup
3767  // commands to make up for earlier datanode failures.  Once all copies
3768  // are made, edit namespace and return to client.
3769  ////////////////////////////////////////////////////////////////
3770
3771  /** 
3772   * Change the indicated filename. 
3773   * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} instead.
3774   */
3775  @Deprecated
3776  boolean renameTo(String src, String dst) 
3777      throws IOException, UnresolvedLinkException {
3778    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3779    if (cacheEntry != null && cacheEntry.isSuccess()) {
3780      return true; // Return previous response
3781    }
3782    boolean ret = false;
3783    try {
3784      ret = renameToInt(src, dst, cacheEntry != null);
3785    } catch (AccessControlException e) {
3786      logAuditEvent(false, "rename", src, dst, null);
3787      throw e;
3788    } finally {
3789      RetryCache.setState(cacheEntry, ret);
3790    }
3791    return ret;
3792  }
3793
3794  private boolean renameToInt(final String srcArg, final String dstArg,
3795    boolean logRetryCache)
3796    throws IOException, UnresolvedLinkException {
3797    String src = srcArg;
3798    String dst = dstArg;
3799    if (NameNode.stateChangeLog.isDebugEnabled()) {
3800      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
3801          " to " + dst);
3802    }
3803    if (!DFSUtil.isValidName(dst)) {
3804      throw new IOException("Invalid name: " + dst);
3805    }
3806    FSPermissionChecker pc = getPermissionChecker();
3807    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3808    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3809    boolean status = false;
3810    HdfsFileStatus resultingStat = null;
3811    writeLock();
3812    try {
3813      checkOperation(OperationCategory.WRITE);
3814      checkNameNodeSafeMode("Cannot rename " + src);
3815      waitForLoadingFSImage();
3816      src = resolvePath(src, srcComponents);
3817      dst = resolvePath(dst, dstComponents);
3818      checkOperation(OperationCategory.WRITE);
3819      status = renameToInternal(pc, src, dst, logRetryCache);
3820      if (status) {
3821        resultingStat = getAuditFileInfo(dst, false);
3822      }
3823    } finally {
3824      writeUnlock();
3825    }
3826    getEditLog().logSync();
3827    if (status) {
3828      logAuditEvent(true, "rename", srcArg, dstArg, resultingStat);
3829    }
3830    return status;
3831  }
3832
3833  /** @deprecated See {@link #renameTo(String, String)} */
3834  @Deprecated
3835  private boolean renameToInternal(FSPermissionChecker pc, String src,
3836      String dst, boolean logRetryCache) throws IOException,
3837      UnresolvedLinkException {
3838    assert hasWriteLock();
3839    if (isPermissionEnabled) {
3840      //We should not be doing this.  This is move() not renameTo().
3841      //but for now,
3842      //NOTE: yes, this is bad!  it's assuming much lower level behavior
3843      //      of rewriting the dst
3844      String actualdst = dir.isDir(dst)?
3845          dst + Path.SEPARATOR + new Path(src).getName(): dst;
3846      // Rename does not operates on link targets
3847      // Do not resolveLink when checking permissions of src and dst
3848      // Check write access to parent of src
3849      checkPermission(pc, src, false, null, FsAction.WRITE, null, null,
3850          false, false);
3851      // Check write access to ancestor of dst
3852      checkPermission(pc, actualdst, false, FsAction.WRITE, null, null, null,
3853          false, false);
3854    }
3855
3856    long mtime = now();
3857    if (dir.renameTo(src, dst, mtime)) {
3858      getEditLog().logRename(src, dst, mtime, logRetryCache);
3859      return true;
3860    }
3861    return false;
3862  }
3863  
3864
3865  /** Rename src to dst */
3866  void renameTo(final String srcArg, final String dstArg,
3867      Options.Rename... options) throws IOException, UnresolvedLinkException {
3868    String src = srcArg;
3869    String dst = dstArg;
3870    if (NameNode.stateChangeLog.isDebugEnabled()) {
3871      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
3872          + src + " to " + dst);
3873    }
3874    if (!DFSUtil.isValidName(dst)) {
3875      throw new InvalidPathException("Invalid name: " + dst);
3876    }
3877    final FSPermissionChecker pc = getPermissionChecker();
3878    
3879    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3880    if (cacheEntry != null && cacheEntry.isSuccess()) {
3881      return; // Return previous response
3882    }
3883    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3884    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3885    HdfsFileStatus resultingStat = null;
3886    boolean success = false;
3887    writeLock();
3888    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
3889    try {
3890      checkOperation(OperationCategory.WRITE);
3891      checkNameNodeSafeMode("Cannot rename " + src);
3892      src = resolvePath(src, srcComponents);
3893      dst = resolvePath(dst, dstComponents);
3894      renameToInternal(pc, src, dst, cacheEntry != null, 
3895          collectedBlocks, options);
3896      resultingStat = getAuditFileInfo(dst, false);
3897      success = true;
3898    } finally {
3899      writeUnlock();
3900      RetryCache.setState(cacheEntry, success);
3901    }
3902    getEditLog().logSync();
3903    if (!collectedBlocks.getToDeleteList().isEmpty()) {
3904      removeBlocks(collectedBlocks);
3905      collectedBlocks.clear();
3906    }
3907    if (resultingStat != null) {
3908      StringBuilder cmd = new StringBuilder("rename options=");
3909      for (Rename option : options) {
3910        cmd.append(option.value()).append(" ");
3911      }
3912      logAuditEvent(true, cmd.toString(), srcArg, dstArg, resultingStat);
3913    }
3914  }
3915
3916  private void renameToInternal(FSPermissionChecker pc, String src, 
3917      String dst, boolean logRetryCache, BlocksMapUpdateInfo collectedBlocks, 
3918      Options.Rename... options) throws IOException {
3919    assert hasWriteLock();
3920    if (isPermissionEnabled) {
3921      // Rename does not operates on link targets
3922      // Do not resolveLink when checking permissions of src and dst
3923      // Check write access to parent of src
3924      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false,
3925          false);
3926      // Check write access to ancestor of dst
3927      checkPermission(pc, dst, false, FsAction.WRITE, null, null, null, false,
3928          false);
3929    }
3930
3931    waitForLoadingFSImage();
3932    long mtime = now();
3933    dir.renameTo(src, dst, mtime, collectedBlocks, options);
3934    getEditLog().logRename(src, dst, mtime, logRetryCache, options);
3935  }
3936  
3937  /**
3938   * Remove the indicated file from namespace.
3939   * 
3940   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3941   * description of exceptions
3942   */
3943  boolean delete(String src, boolean recursive)
3944      throws AccessControlException, SafeModeException,
3945      UnresolvedLinkException, IOException {
3946    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3947    if (cacheEntry != null && cacheEntry.isSuccess()) {
3948      return true; // Return previous response
3949    }
3950    boolean ret = false;
3951    try {
3952      ret = deleteInt(src, recursive, cacheEntry != null);
3953    } catch (AccessControlException e) {
3954      logAuditEvent(false, "delete", src);
3955      throw e;
3956    } finally {
3957      RetryCache.setState(cacheEntry, ret);
3958    }
3959    return ret;
3960  }
3961      
3962  private boolean deleteInt(String src, boolean recursive, boolean logRetryCache)
3963      throws AccessControlException, SafeModeException,
3964      UnresolvedLinkException, IOException {
3965    if (NameNode.stateChangeLog.isDebugEnabled()) {
3966      NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
3967    }
3968    boolean status = deleteInternal(src, recursive, true, logRetryCache);
3969    if (status) {
3970      logAuditEvent(true, "delete", src);
3971    }
3972    return status;
3973  }
3974    
3975  private FSPermissionChecker getPermissionChecker()
3976      throws AccessControlException {
3977    try {
3978      return new FSPermissionChecker(fsOwnerShortUserName, supergroup, getRemoteUser());
3979    } catch (IOException ioe) {
3980      throw new AccessControlException(ioe);
3981    }
3982  }
3983  
3984  /**
3985   * Remove a file/directory from the namespace.
3986   * <p>
3987   * For large directories, deletion is incremental. The blocks under
3988   * the directory are collected and deleted a small number at a time holding
3989   * the {@link FSNamesystem} lock.
3990   * <p>
3991   * For small directory or file the deletion is done in one shot.
3992   * 
3993   * @see ClientProtocol#delete(String, boolean) for description of exceptions
3994   */
3995  private boolean deleteInternal(String src, boolean recursive,
3996      boolean enforcePermission, boolean logRetryCache)
3997      throws AccessControlException, SafeModeException, UnresolvedLinkException,
3998             IOException {
3999    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
4000    List<INode> removedINodes = new ChunkedArrayList<INode>();
4001    FSPermissionChecker pc = getPermissionChecker();
4002    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4003    boolean ret = false;
4004
4005    waitForLoadingFSImage();
4006    writeLock();
4007    try {
4008      checkOperation(OperationCategory.WRITE);
4009      checkNameNodeSafeMode("Cannot delete " + src);
4010      src = resolvePath(src, pathComponents);
4011      if (!recursive && dir.isNonEmptyDirectory(src)) {
4012        throw new PathIsNotEmptyDirectoryException(src + " is non empty");
4013      }
4014      if (enforcePermission && isPermissionEnabled) {
4015        checkPermission(pc, src, false, null, FsAction.WRITE, null,
4016            FsAction.ALL, true, false);
4017      }
4018
4019      long mtime = now();
4020      // Unlink the target directory from directory tree
4021      long filesRemoved = dir.delete(src, collectedBlocks, removedINodes,
4022              mtime);
4023      if (filesRemoved < 0) {
4024        return false;
4025      }
4026      getEditLog().logDelete(src, mtime, logRetryCache);
4027      incrDeletedFileCount(filesRemoved);
4028      // Blocks/INodes will be handled later
4029      removePathAndBlocks(src, null, removedINodes, true);
4030      ret = true;
4031    } finally {
4032      writeUnlock();
4033    }
4034    getEditLog().logSync(); 
4035    removeBlocks(collectedBlocks); // Incremental deletion of blocks
4036    collectedBlocks.clear();
4037
4038    if (NameNode.stateChangeLog.isDebugEnabled()) {
4039      NameNode.stateChangeLog.debug("DIR* Namesystem.delete: "
4040        + src +" is removed");
4041    }
4042    return ret;
4043  }
4044
4045  /**
4046   * From the given list, incrementally remove the blocks from blockManager
4047   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
4048   * ensure that other waiters on the lock can get in. See HDFS-2938
4049   * 
4050   * @param blocks
4051   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
4052   *          of blocks that need to be removed from blocksMap
4053   */
4054  void removeBlocks(BlocksMapUpdateInfo blocks) {
4055    List<Block> toDeleteList = blocks.getToDeleteList();
4056    Iterator<Block> iter = toDeleteList.iterator();
4057    while (iter.hasNext()) {
4058      writeLock();
4059      try {
4060        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
4061          blockManager.removeBlock(iter.next());
4062        }
4063      } finally {
4064        writeUnlock();
4065      }
4066    }
4067  }
4068  
4069  /**
4070   * Remove leases, inodes and blocks related to a given path
4071   * @param src The given path
4072   * @param blocks Containing the list of blocks to be deleted from blocksMap
4073   * @param removedINodes Containing the list of inodes to be removed from 
4074   *                      inodesMap
4075   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
4076   */
4077  void removePathAndBlocks(String src, BlocksMapUpdateInfo blocks,
4078      List<INode> removedINodes, final boolean acquireINodeMapLock) {
4079    assert hasWriteLock();
4080    leaseManager.removeLeaseWithPrefixPath(src);
4081    // remove inodes from inodesMap
4082    if (removedINodes != null) {
4083      if (acquireINodeMapLock) {
4084        dir.writeLock();
4085      }
4086      try {
4087        dir.removeFromInodeMap(removedINodes);
4088      } finally {
4089        if (acquireINodeMapLock) {
4090          dir.writeUnlock();
4091        }
4092      }
4093      removedINodes.clear();
4094    }
4095    if (blocks == null) {
4096      return;
4097    }
4098    
4099    removeBlocksAndUpdateSafemodeTotal(blocks);
4100  }
4101
4102  /**
4103   * Removes the blocks from blocksmap and updates the safemode blocks total
4104   * 
4105   * @param blocks
4106   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
4107   *          of blocks that need to be removed from blocksMap
4108   */
4109  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
4110    assert hasWriteLock();
4111    // In the case that we are a Standby tailing edits from the
4112    // active while in safe-mode, we need to track the total number
4113    // of blocks and safe blocks in the system.
4114    boolean trackBlockCounts = isSafeModeTrackingBlocks();
4115    int numRemovedComplete = 0, numRemovedSafe = 0;
4116
4117    for (Block b : blocks.getToDeleteList()) {
4118      if (trackBlockCounts) {
4119        BlockInfo bi = getStoredBlock(b);
4120        if (bi.isComplete()) {
4121          numRemovedComplete++;
4122          if (bi.numNodes() >= blockManager.minReplication) {
4123            numRemovedSafe++;
4124          }
4125        }
4126      }
4127      blockManager.removeBlock(b);
4128    }
4129    if (trackBlockCounts) {
4130      if (LOG.isDebugEnabled()) {
4131        LOG.debug("Adjusting safe-mode totals for deletion."
4132            + "decreasing safeBlocks by " + numRemovedSafe
4133            + ", totalBlocks by " + numRemovedComplete);
4134      }
4135      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
4136    }
4137  }
4138
4139  /**
4140   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
4141   */
4142  private boolean isSafeModeTrackingBlocks() {
4143    if (!haEnabled) {
4144      // Never track blocks incrementally in non-HA code.
4145      return false;
4146    }
4147    SafeModeInfo sm = this.safeMode;
4148    return sm != null && sm.shouldIncrementallyTrackBlocks();
4149  }
4150
4151  /**
4152   * Get the file info for a specific file.
4153   *
4154   * @param srcArg The string representation of the path to the file
4155   * @param resolveLink whether to throw UnresolvedLinkException 
4156   *        if src refers to a symlink
4157   *
4158   * @throws AccessControlException if access is denied
4159   * @throws UnresolvedLinkException if a symlink is encountered.
4160   *
4161   * @return object containing information regarding the file
4162   *         or null if file not found
4163   * @throws StandbyException 
4164   */
4165  HdfsFileStatus getFileInfo(final String srcArg, boolean resolveLink)
4166    throws AccessControlException, UnresolvedLinkException,
4167           StandbyException, IOException {
4168    String src = srcArg;
4169    if (!DFSUtil.isValidName(src)) {
4170      throw new InvalidPathException("Invalid file name: " + src);
4171    }
4172    HdfsFileStatus stat = null;
4173    FSPermissionChecker pc = getPermissionChecker();
4174    checkOperation(OperationCategory.READ);
4175    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4176    readLock();
4177    try {
4178      checkOperation(OperationCategory.READ);
4179      src = resolvePath(src, pathComponents);
4180      boolean isSuperUser = true;
4181      if (isPermissionEnabled) {
4182        checkPermission(pc, src, false, null, null, null, null, false,
4183            resolveLink);
4184        isSuperUser = pc.isSuperUser();
4185      }
4186      stat = dir.getFileInfo(src, resolveLink,
4187          FSDirectory.isReservedRawName(srcArg), isSuperUser);
4188    } catch (AccessControlException e) {
4189      logAuditEvent(false, "getfileinfo", srcArg);
4190      throw e;
4191    } finally {
4192      readUnlock();
4193    }
4194    logAuditEvent(true, "getfileinfo", srcArg);
4195    return stat;
4196  }
4197  
4198  /**
4199   * Returns true if the file is closed
4200   */
4201  boolean isFileClosed(final String srcArg)
4202      throws AccessControlException, UnresolvedLinkException,
4203      StandbyException, IOException {
4204    String src = srcArg;
4205    FSPermissionChecker pc = getPermissionChecker();  
4206    checkOperation(OperationCategory.READ);
4207    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4208    readLock();
4209    try {
4210      src = resolvePath(src, pathComponents);
4211      checkOperation(OperationCategory.READ);
4212      if (isPermissionEnabled) {
4213        checkTraverse(pc, src);
4214      }
4215      return !INodeFile.valueOf(dir.getINode(src), src).isUnderConstruction();
4216    } catch (AccessControlException e) {
4217      if (isAuditEnabled() && isExternalInvocation()) {
4218        logAuditEvent(false, "isFileClosed", srcArg);
4219      }
4220      throw e;
4221    } finally {
4222      readUnlock();
4223    }
4224  }
4225
4226  /**
4227   * Create all the necessary directories
4228   */
4229  boolean mkdirs(String src, PermissionStatus permissions,
4230      boolean createParent) throws IOException, UnresolvedLinkException {
4231    boolean ret = false;
4232    try {
4233      ret = mkdirsInt(src, permissions, createParent);
4234    } catch (AccessControlException e) {
4235      logAuditEvent(false, "mkdirs", src);
4236      throw e;
4237    }
4238    return ret;
4239  }
4240
4241  private boolean mkdirsInt(final String srcArg, PermissionStatus permissions,
4242      boolean createParent) throws IOException, UnresolvedLinkException {
4243    String src = srcArg;
4244    if(NameNode.stateChangeLog.isDebugEnabled()) {
4245      NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
4246    }
4247    if (!DFSUtil.isValidName(src)) {
4248      throw new InvalidPathException(src);
4249    }
4250    FSPermissionChecker pc = getPermissionChecker();
4251    checkOperation(OperationCategory.WRITE);
4252    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4253    HdfsFileStatus resultingStat = null;
4254    boolean status = false;
4255    writeLock();
4256    try {
4257      checkOperation(OperationCategory.WRITE);   
4258      checkNameNodeSafeMode("Cannot create directory " + src);
4259      src = resolvePath(src, pathComponents);
4260      status = mkdirsInternal(pc, src, permissions, createParent);
4261      if (status) {
4262        resultingStat = getAuditFileInfo(src, false);
4263      }
4264    } finally {
4265      writeUnlock();
4266    }
4267    getEditLog().logSync();
4268    if (status) {
4269      logAuditEvent(true, "mkdirs", srcArg, null, resultingStat);
4270    }
4271    return status;
4272  }
4273    
4274  /**
4275   * Create all the necessary directories
4276   */
4277  private boolean mkdirsInternal(FSPermissionChecker pc, String src,
4278      PermissionStatus permissions, boolean createParent) 
4279      throws IOException, UnresolvedLinkException {
4280    assert hasWriteLock();
4281    if (isPermissionEnabled) {
4282      checkTraverse(pc, src);
4283    }
4284    if (dir.isDirMutable(src)) {
4285      // all the users of mkdirs() are used to expect 'true' even if
4286      // a new directory is not created.
4287      return true;
4288    }
4289    if (isPermissionEnabled) {
4290      checkAncestorAccess(pc, src, FsAction.WRITE);
4291    }
4292    if (!createParent) {
4293      verifyParentDir(src);
4294    }
4295
4296    // validate that we have enough inodes. This is, at best, a 
4297    // heuristic because the mkdirs() operation might need to 
4298    // create multiple inodes.
4299    checkFsObjectLimit();
4300
4301    if (!mkdirsRecursively(src, permissions, false, now())) {
4302      throw new IOException("Failed to create directory: " + src);
4303    }
4304    return true;
4305  }
4306
4307  /**
4308   * Create a directory
4309   * If ancestor directories do not exist, automatically create them.
4310
4311   * @param src string representation of the path to the directory
4312   * @param permissions the permission of the directory
4313   * @param inheritPermission if the permission of the directory should inherit
4314   *                          from its parent or not. u+wx is implicitly added to
4315   *                          the automatically created directories, and to the
4316   *                          given directory if inheritPermission is true
4317   * @param now creation time
4318   * @return true if the operation succeeds false otherwise
4319   * @throws QuotaExceededException if directory creation violates
4320   *                                any quota limit
4321   * @throws UnresolvedLinkException if a symlink is encountered in src.
4322   * @throws SnapshotAccessControlException if path is in RO snapshot
4323   */
4324  private boolean mkdirsRecursively(String src, PermissionStatus permissions,
4325                 boolean inheritPermission, long now)
4326          throws FileAlreadyExistsException, QuotaExceededException,
4327                 UnresolvedLinkException, SnapshotAccessControlException,
4328                 AclException {
4329    src = FSDirectory.normalizePath(src);
4330    byte[][] components = INode.getPathComponents(src);
4331    final int lastInodeIndex = components.length - 1;
4332
4333    dir.writeLock();
4334    try {
4335      INodesInPath iip = dir.getExistingPathINodes(components);
4336      if (iip.isSnapshot()) {
4337        throw new SnapshotAccessControlException(
4338                "Modification on RO snapshot is disallowed");
4339      }
4340      INode[] inodes = iip.getINodes();
4341
4342      // find the index of the first null in inodes[]
4343      StringBuilder pathbuilder = new StringBuilder();
4344      int i = 1;
4345      for(; i < inodes.length && inodes[i] != null; i++) {
4346        pathbuilder.append(Path.SEPARATOR).
4347            append(DFSUtil.bytes2String(components[i]));
4348        if (!inodes[i].isDirectory()) {
4349          throw new FileAlreadyExistsException(
4350                  "Parent path is not a directory: "
4351                  + pathbuilder + " "+inodes[i].getLocalName());
4352        }
4353      }
4354
4355      // default to creating parent dirs with the given perms
4356      PermissionStatus parentPermissions = permissions;
4357
4358      // if not inheriting and it's the last inode, there's no use in
4359      // computing perms that won't be used
4360      if (inheritPermission || (i < lastInodeIndex)) {
4361        // if inheriting (ie. creating a file or symlink), use the parent dir,
4362        // else the supplied permissions
4363        // NOTE: the permissions of the auto-created directories violate posix
4364        FsPermission parentFsPerm = inheritPermission
4365                ? inodes[i-1].getFsPermission() : permissions.getPermission();
4366
4367        // ensure that the permissions allow user write+execute
4368        if (!parentFsPerm.getUserAction().implies(FsAction.WRITE_EXECUTE)) {
4369          parentFsPerm = new FsPermission(
4370                  parentFsPerm.getUserAction().or(FsAction.WRITE_EXECUTE),
4371                  parentFsPerm.getGroupAction(),
4372                  parentFsPerm.getOtherAction()
4373          );
4374        }
4375
4376        if (!parentPermissions.getPermission().equals(parentFsPerm)) {
4377          parentPermissions = new PermissionStatus(
4378                  parentPermissions.getUserName(),
4379                  parentPermissions.getGroupName(),
4380                  parentFsPerm
4381          );
4382          // when inheriting, use same perms for entire path
4383          if (inheritPermission) permissions = parentPermissions;
4384        }
4385      }
4386
4387      // create directories beginning from the first null index
4388      for(; i < inodes.length; i++) {
4389        pathbuilder.append(Path.SEPARATOR).
4390            append(DFSUtil.bytes2String(components[i]));
4391        dir.unprotectedMkdir(allocateNewInodeId(), iip, i, components[i],
4392                (i < lastInodeIndex) ? parentPermissions : permissions, null,
4393                now);
4394        if (inodes[i] == null) {
4395          return false;
4396        }
4397        // Directory creation also count towards FilesCreated
4398        // to match count of FilesDeleted metric.
4399        NameNode.getNameNodeMetrics().incrFilesCreated();
4400
4401        final String cur = pathbuilder.toString();
4402        getEditLog().logMkDir(cur, inodes[i]);
4403        if(NameNode.stateChangeLog.isDebugEnabled()) {
4404          NameNode.stateChangeLog.debug(
4405                  "mkdirs: created directory " + cur);
4406        }
4407      }
4408    } finally {
4409      dir.writeUnlock();
4410    }
4411    return true;
4412  }
4413
4414  /**
4415   * Get the content summary for a specific file/dir.
4416   *
4417   * @param srcArg The string representation of the path to the file
4418   *
4419   * @throws AccessControlException if access is denied
4420   * @throws UnresolvedLinkException if a symlink is encountered.
4421   * @throws FileNotFoundException if no file exists
4422   * @throws StandbyException
4423   * @throws IOException for issues with writing to the audit log
4424   *
4425   * @return object containing information regarding the file
4426   *         or null if file not found
4427   */
4428  ContentSummary getContentSummary(final String srcArg) throws IOException {
4429    String src = srcArg;
4430    FSPermissionChecker pc = getPermissionChecker();
4431    checkOperation(OperationCategory.READ);
4432    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4433    readLock();
4434    boolean success = true;
4435    try {
4436      checkOperation(OperationCategory.READ);
4437      src = resolvePath(src, pathComponents);
4438      if (isPermissionEnabled) {
4439        checkPermission(pc, src, false, null, null, null, FsAction.READ_EXECUTE);
4440      }
4441      return dir.getContentSummary(src);
4442
4443    } catch (AccessControlException ace) {
4444      success = false;
4445      throw ace;
4446    } finally {
4447      readUnlock();
4448      logAuditEvent(success, "contentSummary", srcArg);
4449    }
4450  }
4451
4452  /**
4453   * Set the namespace quota and diskspace quota for a directory.
4454   * See {@link ClientProtocol#setQuota(String, long, long)} for the 
4455   * contract.
4456   * 
4457   * Note: This does not support ".inodes" relative path.
4458   */
4459  void setQuota(String path, long nsQuota, long dsQuota)
4460      throws IOException, UnresolvedLinkException {
4461    checkSuperuserPrivilege();
4462    checkOperation(OperationCategory.WRITE);
4463    writeLock();
4464    try {
4465      checkOperation(OperationCategory.WRITE);
4466      checkNameNodeSafeMode("Cannot set quota on " + path);
4467      INodeDirectory changed = dir.setQuota(path, nsQuota, dsQuota);
4468      if (changed != null) {
4469        final Quota.Counts q = changed.getQuotaCounts();
4470        getEditLog().logSetQuota(path,
4471                q.get(Quota.NAMESPACE), q.get(Quota.DISKSPACE));
4472      }
4473    } finally {
4474      writeUnlock();
4475    }
4476    getEditLog().logSync();
4477  }
4478
4479  /** Persist all metadata about this file.
4480   * @param src The string representation of the path
4481   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
4482   *               INodeId.GRANDFATHER_INODE_ID here.
4483   * @param clientName The string representation of the client
4484   * @param lastBlockLength The length of the last block 
4485   *                        under construction reported from client.
4486   * @throws IOException if path does not exist
4487   */
4488  void fsync(String src, long fileId, String clientName, long lastBlockLength)
4489      throws IOException, UnresolvedLinkException {
4490    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
4491    checkOperation(OperationCategory.WRITE);
4492    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4493
4494    waitForLoadingFSImage();
4495    writeLock();
4496    try {
4497      checkOperation(OperationCategory.WRITE);
4498      checkNameNodeSafeMode("Cannot fsync file " + src);
4499      src = resolvePath(src, pathComponents);
4500      final INode inode;
4501      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
4502        // Older clients may not have given us an inode ID to work with.
4503        // In this case, we have to try to resolve the path and hope it
4504        // hasn't changed or been deleted since the file was opened for write.
4505        inode = dir.getINode(src);
4506      } else {
4507        inode = dir.getInode(fileId);
4508        if (inode != null) src = inode.getFullPathName();
4509      }
4510      final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
4511      if (lastBlockLength > 0) {
4512        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
4513            pendingFile, lastBlockLength);
4514      }
4515      persistBlocks(src, pendingFile, false);
4516    } finally {
4517      writeUnlock();
4518    }
4519    getEditLog().logSync();
4520  }
4521
4522  /**
4523   * Move a file that is being written to be immutable.
4524   * @param src The filename
4525   * @param lease The lease for the client creating the file
4526   * @param recoveryLeaseHolder reassign lease to this holder if the last block
4527   *        needs recovery; keep current holder if null.
4528   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
4529   *         replication;<br>
4530   *         RecoveryInProgressException if lease recovery is in progress.<br>
4531   *         IOException in case of an error.
4532   * @return true  if file has been successfully finalized and closed or 
4533   *         false if block recovery has been initiated. Since the lease owner
4534   *         has been changed and logged, caller should call logSync().
4535   */
  boolean internalReleaseLease(Lease lease, String src, 
      String recoveryLeaseHolder) throws AlreadyBeingCreatedException, 
      IOException, UnresolvedLinkException {
    LOG.info("Recovering " + lease + ", src=" + src);
    assert !isInSafeMode();
    assert hasWriteLock();

    final INodesInPath iip = dir.getLastINodeInPath(src);
    final INodeFile pendingFile = iip.getINode(0).asFile();
    int nrBlocks = pendingFile.numBlocks();
    BlockInfo[] blocks = pendingFile.getBlocks();

    // Count the leading run of COMPLETE blocks; on exit, curBlock refers to
    // the first non-COMPLETE block (if any).
    int nrCompleteBlocks;
    BlockInfo curBlock = null;
    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
      curBlock = blocks[nrCompleteBlocks];
      if(!curBlock.isComplete())
        break;
      assert blockManager.checkMinReplication(curBlock) :
              "A COMPLETE block is not minimally replicated in " + src;
    }

    // If there are no incomplete blocks associated with this file,
    // then reap lease immediately and close the file.
    if(nrCompleteBlocks == nrBlocks) {
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshotId());
      NameNode.stateChangeLog.warn("BLOCK*"
        + " internalReleaseLease: All existing blocks are COMPLETE,"
        + " lease removed, file closed.");
      return true;  // closed!
    }

    // Only the last and the penultimate blocks may be in non COMPLETE state.
    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
    // Anything else indicates an inconsistent state, so reject the release.
    if(nrCompleteBlocks < nrBlocks - 2 ||
       nrCompleteBlocks == nrBlocks - 2 &&
         curBlock != null &&
         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
      final String message = "DIR* NameSystem.internalReleaseLease: "
        + "attempt to release a create lock on "
        + src + " but file is already closed.";
      NameNode.stateChangeLog.warn(message);
      throw new IOException(message);
    }

    // The last block is not COMPLETE, and
    // that the penultimate block if exists is either COMPLETE or COMMITTED
    final BlockInfo lastBlock = pendingFile.getLastBlock();
    BlockUCState lastBlockState = lastBlock.getBlockUCState();
    BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();

    // If penultimate block doesn't exist then its minReplication is met
    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
        blockManager.checkMinReplication(penultimateBlock);

    switch(lastBlockState) {
    case COMPLETE:
      // Unreachable: the scan above already found a non-COMPLETE block.
      assert false : "Already checked that the last block is incomplete";
      break;
    case COMMITTED:
      // Close file if committed blocks are minimally replicated
      if(penultimateBlockMinReplication &&
          blockManager.checkMinReplication(lastBlock)) {
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK*"
          + " internalReleaseLease: Committed blocks are minimally replicated,"
          + " lease removed, file closed.");
        return true;  // closed!
      }
      // Cannot close file right now, since some blocks 
      // are not yet minimally replicated.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      String message = "DIR* NameSystem.internalReleaseLease: " +
          "Failed to release lease for file " + src +
          ". Committed blocks are waiting to be minimally replicated." +
          " Try again later.";
      NameNode.stateChangeLog.warn(message);
      throw new AlreadyBeingCreatedException(message);
    case UNDER_CONSTRUCTION:
    case UNDER_RECOVERY:
      final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)lastBlock;
      // setup the last block locations from the blockManager if not known
      if (uc.getNumExpectedLocations() == 0) {
        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
      }

      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
        // There is no datanode reported to this block.
        // may be client have crashed before writing data to pipeline.
        // This blocks doesn't need any recovery.
        // We can remove this block and close the file.
        pendingFile.removeLastBlock(lastBlock);
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
            + "Removed empty last block and closed file.");
        return true;
      }
      // start recovery of the last block for this file
      long blockRecoveryId = nextGenerationStamp(isLegacyBlock(uc));
      // Reassigning the lease logs an (unsynced) edit; per the method
      // contract the caller must logSync() when false is returned.
      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
      uc.initializeBlockRecovery(blockRecoveryId);
      leaseManager.renewLease(lease);
      // Cannot close file right now, since the last block requires recovery.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      NameNode.stateChangeLog.warn(
                "DIR* NameSystem.internalReleaseLease: " +
                "File " + src + " has not been closed." +
               " Lease recovery is in progress. " +
                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
      break;
    }
    return false;
  }
4654
4655  private Lease reassignLease(Lease lease, String src, String newHolder,
4656      INodeFile pendingFile) {
4657    assert hasWriteLock();
4658    if(newHolder == null)
4659      return lease;
4660    // The following transaction is not synced. Make sure it's sync'ed later.
4661    logReassignLease(lease.getHolder(), src, newHolder);
4662    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4663  }
4664  
4665  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
4666      INodeFile pendingFile) {
4667    assert hasWriteLock();
4668    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
4669    return leaseManager.reassignLease(lease, src, newHolder);
4670  }
4671
4672  private void commitOrCompleteLastBlock(final INodeFile fileINode,
4673      final Block commitBlock) throws IOException {
4674    assert hasWriteLock();
4675    Preconditions.checkArgument(fileINode.isUnderConstruction());
4676    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
4677      return;
4678    }
4679
4680    // Adjust disk space consumption if required
4681    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
4682    if (diff > 0) {
4683      try {
4684        String path = fileINode.getFullPathName();
4685        dir.updateSpaceConsumed(path, 0, -diff*fileINode.getFileReplication());
4686      } catch (IOException e) {
4687        LOG.warn("Unexpected exception while updating disk space.", e);
4688      }
4689    }
4690  }
4691
  /**
   * Turn an under-construction file into a complete (closed) file: strip the
   * under-construction feature, release the lease, log a CloseOp to the edit
   * log and trigger a replication check for the file's blocks.
   *
   * @param src path of the file being finalized
   * @param pendingFile the inode; must carry FileUnderConstructionFeature
   * @param latestSnapshot id of the latest snapshot covering the file, used
   *        to record this modification for snapshot diffs
   * @throws IOException if the file is not under construction
   */
  private void finalizeINodeFileUnderConstruction(String src,
      INodeFile pendingFile, int latestSnapshot) throws IOException,
      UnresolvedLinkException {
    assert hasWriteLock();

    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
    if (uc == null) {
      throw new IOException("Cannot finalize file " + src
          + " because it is not under construction");
    }
    
    // Record the change against the latest snapshot before mutating the inode.
    pendingFile.recordModification(latestSnapshot);

    // The file is no longer pending.
    // Create permanent INode, update blocks. No need to replace the inode here
    // since we just remove the uc feature from pendingFile
    final INodeFile newFile = pendingFile.toCompleteFile(now());

    leaseManager.removeLease(uc.getClientName(), src);

    waitForLoadingFSImage();
    // close file and persist block allocations for this file
    closeFile(src, newFile);

    blockManager.checkReplication(newFile);
  }
4718
  /**
   * Look up the {@link BlockInfo} the block manager has stored for the given
   * block, or null if the block is unknown. Exposed for tests.
   */
  @VisibleForTesting
  BlockInfo getStoredBlock(Block block) {
    return blockManager.getStoredBlock(block);
  }
4723  
4724  @Override
4725  public boolean isInSnapshot(BlockInfoUnderConstruction blockUC) {
4726    assert hasReadLock();
4727    final BlockCollection bc = blockUC.getBlockCollection();
4728    if (bc == null || !(bc instanceof INodeFile)
4729        || !bc.isUnderConstruction()) {
4730      return false;
4731    }
4732
4733    INodeFile inodeUC = (INodeFile) bc;
4734    String fullName = inodeUC.getName();
4735    try {
4736      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4737          && dir.getINode(fullName) == inodeUC) {
4738        // If file exists in normal path then no need to look in snapshot
4739        return false;
4740      }
4741    } catch (UnresolvedLinkException e) {
4742      LOG.error("Error while resolving the link : " + fullName, e);
4743      return false;
4744    }
4745    /*
4746     * 1. if bc is an instance of INodeFileUnderConstructionWithSnapshot, and
4747     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4748     * file. 
4749     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4750     * current fsdirectory tree. 
4751     * 3. if bc is not the current node associated with fullName, bc must be a
4752     * snapshot inode.
4753     */
4754    return true;
4755  }
4756
  /**
   * Called by a datanode (via the primary recovery node) to report the result
   * of block recovery: update the last block's generation stamp and length,
   * record the surviving replica locations, and optionally close the file.
   *
   * @param lastblock the block under recovery
   * @param newgenerationstamp recovery id; must match the stamp issued when
   *        recovery was initiated
   * @param newlength finalized length of the block
   * @param closeFile if true, commit the block and close the file
   * @param deleteblock if true, drop the block instead of updating it
   * @param newtargets datanodes holding the recovered replicas
   * @param newtargetstorages storage ids parallel to {@code newtargets}
   */
  void commitBlockSynchronization(ExtendedBlock lastblock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages)
      throws IOException, UnresolvedLinkException {
    LOG.info("commitBlockSynchronization(lastblock=" + lastblock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    String src = "";
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfo storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(lastblock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + lastblock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + lastblock + ") not found");
        }
      }
      // Remember the pre-recovery stamp/length so stale replicas can be
      // marked corrupt after the update.
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + iFile.getFullPathName() + ", likely due to delayed block"
            + " removal");
      }
      if (!iFile.isUnderConstruction() || storedBlock.isComplete()) {
        // Likely a duplicate/stale report; nothing to do.
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + lastblock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      // Reject reports from a superseded recovery attempt.
      long recoveryId =
        ((BlockInfoUnderConstruction)storedBlock).getBlockRecoveryId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + lastblock); 
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(lastblock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlockFromMap(storedBlock);
        }
      }
      else {
        // update last block
        storedBlock.setGenerationStamp(newgenerationstamp);
        storedBlock.setNumBytes(newlength);

        // find the DatanodeDescriptor objects
        // Targets that are no longer registered are silently dropped
        // ("trimmed") from both lists.
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              storageInfo.addBlock(storedBlock);
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]));
        iFile.setLastBlock(storedBlock, trimmedStorageInfos);
        if (closeFile) {
          // Replicas still carrying the old stamp/length are now corrupt.
          blockManager.markBlockReplicasAsCorrupt(storedBlock,
              oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
        }
      }

      if (closeFile) {
        src = closeFileCommitBlocks(iFile, storedBlock);
      } else {
        // If this commit does not want to close the file, persist blocks
        src = iFile.getFullPathName();
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edits written above outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(newblock=" + lastblock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + lastblock + ") successful");
    }
  }
4913
4914  /**
4915   * @param pendingFile open file that needs to be closed
4916   * @param storedBlock last block
4917   * @return Path of the file that was closed.
4918   * @throws IOException on error
4919   */
4920  @VisibleForTesting
4921  String closeFileCommitBlocks(INodeFile pendingFile, BlockInfo storedBlock)
4922      throws IOException {
4923    String src = pendingFile.getFullPathName();
4924
4925    // commit the last block and complete it if it has minimum replicas
4926    commitOrCompleteLastBlock(pendingFile, storedBlock);
4927
4928    //remove lease, close file
4929    finalizeINodeFileUnderConstruction(src, pendingFile,
4930        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4931
4932    return src;
4933  }
4934
4935  /**
4936   * Renew the lease(s) held by the given client
4937   */
4938  void renewLease(String holder) throws IOException {
4939    checkOperation(OperationCategory.WRITE);
4940    readLock();
4941    try {
4942      checkOperation(OperationCategory.WRITE);
4943      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4944      leaseManager.renewLease(holder);
4945    } finally {
4946      readUnlock();
4947    }
4948  }
4949
4950  /**
4951   * Get a partial listing of the indicated directory
4952   *
4953   * @param src the directory name
4954   * @param startAfter the name to start after
4955   * @param needLocation if blockLocations need to be returned
4956   * @return a partial listing starting after startAfter
4957   * 
4958   * @throws AccessControlException if access is denied
4959   * @throws UnresolvedLinkException if symbolic link is encountered
4960   * @throws IOException if other I/O error occurred
4961   */
4962  DirectoryListing getListing(String src, byte[] startAfter,
4963      boolean needLocation) 
4964      throws AccessControlException, UnresolvedLinkException, IOException {
4965    try {
4966      return getListingInt(src, startAfter, needLocation);
4967    } catch (AccessControlException e) {
4968      logAuditEvent(false, "listStatus", src);
4969      throw e;
4970    }
4971  }
4972
  /**
   * Core implementation of {@link #getListing}: resolves reserved paths,
   * checks permissions and returns a partial listing starting after
   * {@code startAfter}.
   */
  private DirectoryListing getListingInt(final String srcArg, byte[] startAfter,
      boolean needLocation)
    throws AccessControlException, UnresolvedLinkException, IOException {
    String src = srcArg;
    DirectoryListing dl;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    // NOTE(review): decodes with the platform default charset; presumably
    // path bytes are UTF-8 -- confirm before relying on this for non-ASCII
    // names.
    String startAfterString = new String(startAfter);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      src = resolvePath(src, pathComponents);

      // Get file name when startAfter is an INodePath
      if (FSDirectory.isReservedName(startAfterString)) {
        byte[][] startAfterComponents = FSDirectory
            .getPathComponentsForReservedPath(startAfterString);
        try {
          String tmp = FSDirectory.resolvePath(src, startAfterComponents, dir);
          byte[][] regularPath = INode.getPathComponents(tmp);
          startAfter = regularPath[regularPath.length - 1];
        } catch (IOException e) {
          // Possibly the inode is deleted
          throw new DirectoryListingStartAfterNotFoundException(
              "Can't find startAfter " + startAfterString);
        }
      }

      boolean isSuperUser = true;
      if (isPermissionEnabled) {
        // Listing a directory requires READ_EXECUTE on it; for a single file
        // only traversal of the parent path is needed.
        if (dir.isDir(src)) {
          checkPathAccess(pc, src, FsAction.READ_EXECUTE);
        } else {
          checkTraverse(pc, src);
        }
        isSuperUser = pc.isSuperUser();
      }
      logAuditEvent(true, "listStatus", srcArg);
      dl = dir.getListing(src, startAfter, needLocation, isSuperUser);
    } finally {
      readUnlock();
    }
    return dl;
  }
5018
5019  /////////////////////////////////////////////////////////
5020  //
5021  // These methods are called by datanodes
5022  //
5023  /////////////////////////////////////////////////////////
5024  /**
5025   * Register Datanode.
5026   * <p>
5027   * The purpose of registration is to identify whether the new datanode
5028   * serves a new data storage, and will report new data block copies,
5029   * which the namenode was not aware of; or the datanode is a replacement
5030   * node for the data storage that was previously served by a different
5031   * or the same (in terms of host:port) datanode.
5032   * The data storages are distinguished by their storageIDs. When a new
5033   * data storage is reported the namenode issues a new unique storageID.
5034   * <p>
5035   * Finally, the namenode returns its namespaceID as the registrationID
5036   * for the datanodes. 
5037   * namespaceID is a persistent attribute of the name space.
5038   * The registrationID is checked every time the datanode is communicating
5039   * with the namenode. 
5040   * Datanodes with inappropriate registrationID are rejected.
5041   * If the namenode stops, and then restarts it can restore its 
5042   * namespaceID and will continue serving the datanodes that has previously
5043   * registered with the namenode without restarting the whole cluster.
5044   * 
5045   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
5046   */
  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
    writeLock();
    try {
      // Delegate to the DatanodeManager, then re-evaluate safe mode since
      // the set of live datanodes may have changed.
      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
      checkSafeMode();
    } finally {
      writeUnlock();
    }
  }
5056  
5057  /**
5058   * Get registrationID for datanodes based on the namespaceID.
5059   * 
5060   * @see #registerDatanode(DatanodeRegistration)
5061   * @return registration ID
5062   */
5063  String getRegistrationID() {
5064    return Storage.getRegistrationID(getFSImage().getStorage());
5065  }
5066
5067  /**
5068   * The given node has reported in.  This method should:
5069   * 1) Record the heartbeat, so the datanode isn't timed out
5070   * 2) Adjust usage stats for future block allocation
5071   * 
5072   * If a substantial amount of time passed since the last datanode 
5073   * heartbeat then request an immediate block report.  
5074   * 
5075   * @return an array of datanode commands 
5076   * @throws IOException
5077   */
  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
      StorageReport[] reports, long cacheCapacity, long cacheUsed,
      int xceiverCount, int xmitsInProgress, int failedVolumes)
        throws IOException {
    readLock();
    try {
      //get datanode commands
      // Cap new transfer commands so the DN's total replication streams stay
      // within the configured maximum.
      final int maxTransfer = blockManager.getMaxReplicationStreams()
          - xmitsInProgress;
      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
          xceiverCount, maxTransfer, failedVolumes);
      
      //create ha status
      // Reports this NN's HA state and latest txid so DNs can detect failover.
      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
          haContext.getState().getServiceState(),
          getFSImage().getLastAppliedOrWrittenTxId());

      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
    } finally {
      readUnlock();
    }
  }
5101
5102  /**
5103   * Returns whether or not there were available resources at the last check of
5104   * resources.
5105   *
5106   * @return true if there were sufficient resources available, false otherwise.
5107   */
5108  boolean nameNodeHasResourcesAvailable() {
5109    return hasResourcesAvailable;
5110  }
5111
5112  /**
5113   * Perform resource checks and cache the results.
5114   */
5115  void checkAvailableResources() {
5116    Preconditions.checkState(nnResourceChecker != null,
5117        "nnResourceChecker not initialized");
5118    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
5119  }
5120
5121  /**
5122   * Persist the block list for the inode.
5123   * @param path
5124   * @param file
5125   * @param logRetryCache
5126   */
5127  private void persistBlocks(String path, INodeFile file,
5128                             boolean logRetryCache) {
5129    assert hasWriteLock();
5130    Preconditions.checkArgument(file.isUnderConstruction());
5131    getEditLog().logUpdateBlocks(path, file, logRetryCache);
5132    if(NameNode.stateChangeLog.isDebugEnabled()) {
5133      NameNode.stateChangeLog.debug("persistBlocks: " + path
5134              + " with " + file.getBlocks().length + " blocks is persisted to" +
5135              " the file system");
5136    }
5137  }
5138
  /** Bump the NameNode metric for the number of files deleted. */
  void incrDeletedFileCount(long count) {
    NameNode.getNameNodeMetrics().incrFilesDeleted(count);
  }
5142
5143  /**
5144   * Close file.
5145   * @param path
5146   * @param file
5147   */
5148  private void closeFile(String path, INodeFile file) {
5149    assert hasWriteLock();
5150    waitForLoadingFSImage();
5151    // file is closed
5152    getEditLog().logCloseFile(path, file);
5153    if (NameNode.stateChangeLog.isDebugEnabled()) {
5154      NameNode.stateChangeLog.debug("closeFile: "
5155              +path+" with "+ file.getBlocks().length
5156              +" blocks is persisted to the file system");
5157    }
5158  }
5159
5160  /**
5161   * Add the given symbolic link to the fs. Record it in the edits log.
5162   */
5163  private INodeSymlink addSymlink(String path, String target,
5164                                  PermissionStatus dirPerms,
5165                                  boolean createParent, boolean logRetryCache)
5166      throws UnresolvedLinkException, FileAlreadyExistsException,
5167      QuotaExceededException, SnapshotAccessControlException, AclException {
5168    waitForLoadingFSImage();
5169
5170    final long modTime = now();
5171    if (createParent) {
5172      final String parent = new Path(path).getParent().toString();
5173      if (!mkdirsRecursively(parent, dirPerms, true, modTime)) {
5174        return null;
5175      }
5176    }
5177    final String userName = dirPerms.getUserName();
5178    long id = allocateNewInodeId();
5179    INodeSymlink newNode = dir.addSymlink(id, path, target, modTime, modTime,
5180            new PermissionStatus(userName, null, FsPermission.getDefault()));
5181    if (newNode == null) {
5182      NameNode.stateChangeLog.info("addSymlink: failed to add " + path);
5183      return null;
5184    }
5185    getEditLog().logSymlink(path, target, modTime, modTime, newNode,
5186        logRetryCache);
5187
5188    if(NameNode.stateChangeLog.isDebugEnabled()) {
5189      NameNode.stateChangeLog.debug("addSymlink: " + path + " is added");
5190    }
5191    return newNode;
5192  }
5193
5194  /**
5195   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
5196   * there are found to be insufficient resources available, causes the NN to
5197   * enter safe mode. If resources are later found to have returned to
5198   * acceptable levels, this daemon will cause the NN to exit safe mode.
5199   */
5200  class NameNodeResourceMonitor implements Runnable  {
5201    boolean shouldNNRmRun = true;
5202    @Override
5203    public void run () {
5204      try {
5205        while (fsRunning && shouldNNRmRun) {
5206          checkAvailableResources();
5207          if(!nameNodeHasResourcesAvailable()) {
5208            String lowResourcesMsg = "NameNode low on available disk space. ";
5209            if (!isInSafeMode()) {
5210              FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode.");
5211            } else {
5212              FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode.");
5213            }
5214            enterSafeMode(true);
5215          }
5216          try {
5217            Thread.sleep(resourceRecheckInterval);
5218          } catch (InterruptedException ie) {
5219            // Deliberately ignore
5220          }
5221        }
5222      } catch (Exception e) {
5223        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
5224      }
5225    }
5226
5227    public void stopMonitor() {
5228      shouldNNRmRun = false;
5229    }
5230 }
5231
5232  class NameNodeEditLogRoller implements Runnable {
5233
5234    private boolean shouldRun = true;
5235    private final long rollThreshold;
5236    private final long sleepIntervalMs;
5237
5238    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
5239        this.rollThreshold = rollThreshold;
5240        this.sleepIntervalMs = sleepIntervalMs;
5241    }
5242
5243    @Override
5244    public void run() {
5245      while (fsRunning && shouldRun) {
5246        try {
5247          FSEditLog editLog = getFSImage().getEditLog();
5248          long numEdits =
5249              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
5250          if (numEdits > rollThreshold) {
5251            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
5252                + " number of edits in open segment exceeds threshold of "
5253                + rollThreshold);
5254            rollEditLog();
5255          }
5256        } catch (Exception e) {
5257          FSNamesystem.LOG.error("Swallowing exception in "
5258              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
5259        }
5260        try {
5261          Thread.sleep(sleepIntervalMs);
5262        } catch (InterruptedException e) {
5263          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
5264              + " was interrupted, exiting");
5265          break;
5266        }
5267      }
5268    }
5269
5270    public void stop() {
5271      shouldRun = false;
5272    }
5273  }
5274
5275  /**
5276   * Daemon to periodically scan the namespace for lazyPersist files
5277   * with missing blocks and unlink them.
5278   */
5279  class LazyPersistFileScrubber implements Runnable {
5280    private volatile boolean shouldRun = true;
5281    final int scrubIntervalSec;
5282    public LazyPersistFileScrubber(final int scrubIntervalSec) {
5283      this.scrubIntervalSec = scrubIntervalSec;
5284    }
5285
5286    /**
5287     * Periodically go over the list of lazyPersist files with missing
5288     * blocks and unlink them from the namespace.
5289     */
5290    private void clearCorruptLazyPersistFiles()
5291        throws SafeModeException, AccessControlException,
5292        UnresolvedLinkException, IOException {
5293
5294      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
5295
5296      List<BlockCollection> filesToDelete = new ArrayList<BlockCollection>();
5297
5298      writeLock();
5299
5300      try {
5301        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
5302
5303        while (it.hasNext()) {
5304          Block b = it.next();
5305          BlockInfo blockInfo = blockManager.getStoredBlock(b);
5306          if (blockInfo.getBlockCollection().getStoragePolicyID() == lpPolicy.getId()) {
5307            filesToDelete.add(blockInfo.getBlockCollection());
5308          }
5309        }
5310
5311        for (BlockCollection bc : filesToDelete) {
5312          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
5313          deleteInternal(bc.getName(), false, false, false);
5314        }
5315      } finally {
5316        writeUnlock();
5317      }
5318    }
5319
5320    @Override
5321    public void run() {
5322      while (fsRunning && shouldRun) {
5323        try {
5324          clearCorruptLazyPersistFiles();
5325          Thread.sleep(scrubIntervalSec * 1000);
5326        } catch (InterruptedException e) {
5327          FSNamesystem.LOG.info(
5328              "LazyPersistFileScrubber was interrupted, exiting");
5329          break;
5330        } catch (Exception e) {
5331          FSNamesystem.LOG.error(
5332              "Ignoring exception in LazyPersistFileScrubber:", e);
5333        }
5334      }
5335    }
5336
    public void stop() {
      // run() re-checks shouldRun on each iteration; no interrupt is issued
      // here, so the thread exits only after its current sleep completes.
      shouldRun = false;
    }
5340  }
5341
  /** @return the FSImage backing this namesystem. */
  public FSImage getFSImage() {
    return fsImage;
  }
5345
  /** @return the edit log of the backing FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }
5349
5350  private void checkBlock(ExtendedBlock block) throws IOException {
5351    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
5352      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
5353          + " - expected " + blockPoolId);
5354    }
5355  }
5356
  /** @return number of blocks with no remaining replicas. */
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking; a slightly stale value is acceptable for metrics
    return blockManager.getMissingBlocksCount();
  }
5362  
  /** @return number of datanode heartbeats that have expired. */
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
5367  
5368  @Metric({"TransactionsSinceLastCheckpoint",
5369      "Number of transactions since last checkpoint"})
5370  public long getTransactionsSinceLastCheckpoint() {
5371    return getEditLog().getLastWrittenTxId() -
5372        getFSImage().getStorage().getMostRecentCheckpointTxId();
5373  }
5374  
5375  @Metric({"TransactionsSinceLastLogRoll",
5376      "Number of transactions since last edit log roll"})
5377  public long getTransactionsSinceLastLogRoll() {
5378    if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
5379      return 0;
5380    } else {
5381      return getEditLog().getLastWrittenTxId() -
5382        getEditLog().getCurSegmentTxId() + 1;
5383    }
5384  }
5385  
  /** @return the highest transaction id written to the edit log. */
  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxId();
  }
5390  
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
5396
5397  /** @see ClientProtocol#getStats() */
5398  long[] getStats() {
5399    final long[] stats = datanodeStatistics.getStats();
5400    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
5401    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
5402    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
5403    return stats;
5404  }
5405
  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }
5412
  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    // Same value as getCapacityTotal(), rounded to gigabytes for metrics.
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }
5418
  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }
5425
  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    // Same value as getCapacityUsed(), rounded to gigabytes for metrics.
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }
5431
  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }
5437
  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    // Same value as getCapacityRemaining(), rounded to gigabytes.
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }
5442
  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
5448
5449  /**
5450   * Total number of connections.
5451   */
5452  @Override // FSNamesystemMBean
5453  @Metric
5454  public int getTotalLoad() {
5455    return datanodeStatistics.getXceiverCount();
5456  }
5457  
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }
5462
  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
5467
5468  @Override
5469  public String getSnapshotStats() {
5470    Map<String, Object> info = new HashMap<String, Object>();
5471    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
5472    info.put("Snapshots", this.getNumSnapshots());
5473    return JSON.toString(info);
5474  }
5475
5476  int getNumberOfDatanodes(DatanodeReportType type) {
5477    readLock();
5478    try {
5479      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
5480          type).size(); 
5481    } finally {
5482      readUnlock();
5483    }
5484  }
5485
5486  DatanodeInfo[] datanodeReport(final DatanodeReportType type
5487      ) throws AccessControlException, StandbyException {
5488    checkSuperuserPrivilege();
5489    checkOperation(OperationCategory.UNCHECKED);
5490    readLock();
5491    try {
5492      checkOperation(OperationCategory.UNCHECKED);
5493      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
5494      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
5495
5496      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
5497      for (int i=0; i<arr.length; i++) {
5498        arr[i] = new DatanodeInfo(results.get(i));
5499      }
5500      return arr;
5501    } finally {
5502      readUnlock();
5503    }
5504  }
5505
5506  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
5507      ) throws AccessControlException, StandbyException {
5508    checkSuperuserPrivilege();
5509    checkOperation(OperationCategory.UNCHECKED);
5510    readLock();
5511    try {
5512      checkOperation(OperationCategory.UNCHECKED);
5513      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
5514      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);
5515
5516      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
5517      for (int i = 0; i < reports.length; i++) {
5518        final DatanodeDescriptor d = datanodes.get(i);
5519        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
5520            d.getStorageReports());
5521      }
5522      return reports;
5523    } finally {
5524      readUnlock();
5525    }
5526  }
5527
5528  /**
5529   * Save namespace image.
5530   * This will save current namespace into fsimage file and empty edits file.
5531   * Requires superuser privilege and safe mode.
5532   * 
5533   * @throws AccessControlException if superuser privilege is violated.
5534   * @throws IOException if 
5535   */
5536  void saveNamespace() throws AccessControlException, IOException {
5537    checkOperation(OperationCategory.UNCHECKED);
5538    checkSuperuserPrivilege();
5539    
5540    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5541    if (cacheEntry != null && cacheEntry.isSuccess()) {
5542      return; // Return previous response
5543    }
5544    boolean success = false;
5545    readLock();
5546    try {
5547      checkOperation(OperationCategory.UNCHECKED);
5548
5549      if (!isInSafeMode()) {
5550        throw new IOException("Safe mode should be turned ON "
5551            + "in order to create namespace image.");
5552      }
5553      getFSImage().saveNamespace(this);
5554      success = true;
5555    } finally {
5556      readUnlock();
5557      RetryCache.setState(cacheEntry, success);
5558    }
5559    LOG.info("New namespace image has been created");
5560  }
5561  
5562  /**
5563   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
5564   * Requires superuser privilege.
5565   * 
5566   * @throws AccessControlException if superuser privilege is violated.
5567   */
5568  boolean restoreFailedStorage(String arg) throws AccessControlException,
5569      StandbyException {
5570    checkSuperuserPrivilege();
5571    checkOperation(OperationCategory.UNCHECKED);
5572    writeLock();
5573    try {
5574      checkOperation(OperationCategory.UNCHECKED);
5575      
5576      // if it is disabled - enable it and vice versa.
5577      if(arg.equals("check"))
5578        return getFSImage().getStorage().getRestoreFailedStorage();
5579      
5580      boolean val = arg.equals("true");  // false if not
5581      getFSImage().getStorage().setRestoreFailedStorage(val);
5582      
5583      return val;
5584    } finally {
5585      writeUnlock();
5586    }
5587  }
5588
  /** @return the namesystem start time, as a freshly allocated Date. */
  Date getStartTime() {
    return new Date(startTime); 
  }
5592    
  /**
   * Finalize a previously started upgrade. Superuser only.
   *
   * @throws IOException if the privilege/operation check or finalize fails
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock();
    }
  }
5604
  /**
   * Re-read the include/exclude host lists from a fresh configuration.
   * Superuser only.
   *
   * @throws IOException if the privilege/operation check or refresh fails
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
5610
  /**
   * Push a new balancer bandwidth value to the datanodes. Superuser only.
   *
   * @param bandwidth new bandwidth in bytes per second
   * @throws IOException if the privilege/operation check fails
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
5616
5617  /**
5618   * Persist the new block (the last block of the given file).
5619   * @param path
5620   * @param file
5621   */
5622  private void persistNewBlock(String path, INodeFile file) {
5623    Preconditions.checkArgument(file.isUnderConstruction());
5624    getEditLog().logAddBlock(path, file);
5625    if (NameNode.stateChangeLog.isDebugEnabled()) {
5626      NameNode.stateChangeLog.debug("persistNewBlock: "
5627              + path + " with new block " + file.getLastBlock().toString()
5628              + ", current total block count is " + file.getBlocks().length);
5629    }
5630  }
5631
5632  /**
5633   * SafeModeInfo contains information related to the safe mode.
5634   * <p>
5635   * An instance of {@link SafeModeInfo} is created when the name node
5636   * enters safe mode.
5637   * <p>
5638   * During name node startup {@link SafeModeInfo} counts the number of
5639   * <em>safe blocks</em>, those that have at least the minimal number of
5640   * replicas, and calculates the ratio of safe blocks to the total number
5641   * of blocks in the system, which is the size of blocks in
5642   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5643   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5644   * to monitor whether the safe mode {@link #extension} is passed.
5645   * Then it leaves safe mode and destroys itself.
5646   * <p>
5647   * If safe mode is turned on manually then the number of safe blocks is
5648   * not tracked because the name node is not intended to leave safe mode
5649   * automatically in the case.
5650   *
5651   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5652   */
5653  public class SafeModeInfo {
5654    // configuration fields
5655    /** Safe mode threshold condition %.*/
5656    private final double threshold;
5657    /** Safe mode minimum number of datanodes alive */
5658    private final int datanodeThreshold;
5659    /**
5660     * Safe mode extension after the threshold.
5661     * Make it volatile so that getSafeModeTip can read the latest value
5662     * without taking a lock.
5663     */
5664    private volatile int extension;
5665    /** Min replication required by safe mode. */
5666    private final int safeReplication;
5667    /** threshold for populating needed replication queues */
5668    private final double replQueueThreshold;
5669    // internal fields
5670    /** Time when threshold was reached.
5671     * <br> -1 safe mode is off
5672     * <br> 0 safe mode is on, and threshold is not reached yet
5673     * <br> >0 safe mode is on, but we are in extension period 
5674     */
5675    private long reached = -1;  
5676    /** Total number of blocks. */
5677    int blockTotal; 
5678    /** Number of safe blocks. */
5679    int blockSafe;
5680    /** Number of blocks needed to satisfy safe mode threshold condition */
5681    private int blockThreshold;
5682    /** Number of blocks needed before populating replication queues */
5683    private int blockReplQueueThreshold;
5684    /** time of the last status printout */
5685    private long lastStatusReport = 0;
5686    /**
5687     * Was safemode entered automatically because available resources were low.
5688     * Make it volatile so that getSafeModeTip can read the latest value
5689     * without taking a lock.
5690     */
5691    private volatile boolean resourcesLow = false;
5692    /** Should safemode adjust its block totals as blocks come in */
5693    private boolean shouldIncrementallyTrackBlocks = false;
5694    /** counter for tracking startup progress of reported blocks */
5695    private Counter awaitingReportedBlocksCounter;
5696    
5697    /**
5698     * Creates SafeModeInfo when the name node enters
5699     * automatic safe mode at startup.
5700     *  
5701     * @param conf configuration
5702     */
5703    private SafeModeInfo(Configuration conf) {
5704      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5705          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5706      if(threshold > 1.0) {
5707        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5708      }
5709      this.datanodeThreshold = conf.getInt(
5710        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5711        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5712      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5713      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5714                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5715      
5716      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5717      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5718      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5719
5720      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5721      this.replQueueThreshold = 
5722        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5723                      (float) threshold);
5724      this.blockTotal = 0; 
5725      this.blockSafe = 0;
5726    }
5727
5728    /**
5729     * In the HA case, the StandbyNode can be in safemode while the namespace
5730     * is modified by the edit log tailer. In this case, the number of total
5731     * blocks changes as edits are processed (eg blocks are added and deleted).
5732     * However, we don't want to do the incremental tracking during the
5733     * startup-time loading process -- only once the initial total has been
5734     * set after the image has been loaded.
5735     */
5736    private boolean shouldIncrementallyTrackBlocks() {
5737      return shouldIncrementallyTrackBlocks;
5738    }
5739
5740    /**
5741     * Creates SafeModeInfo when safe mode is entered manually, or because
5742     * available resources are low.
5743     *
5744     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5745     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5746     * 
5747     * @see SafeModeInfo
5748     */
5749    private SafeModeInfo(boolean resourcesLow) {
5750      this.threshold = 1.5f;  // this threshold can never be reached
5751      this.datanodeThreshold = Integer.MAX_VALUE;
5752      this.extension = Integer.MAX_VALUE;
5753      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
5754      this.replQueueThreshold = 1.5f; // can never be reached
5755      this.blockTotal = -1;
5756      this.blockSafe = -1;
5757      this.resourcesLow = resourcesLow;
5758      enter();
5759      reportStatus("STATE* Safe mode is ON.", true);
5760    }
5761      
5762    /**
5763     * Check if safe mode is on.
5764     * @return true if in safe mode
5765     */
5766    private synchronized boolean isOn() {
5767      doConsistencyCheck();
5768      return this.reached >= 0;
5769    }
5770      
5771    /**
5772     * Enter safe mode.
5773     */
5774    private void enter() {
5775      this.reached = 0;
5776    }
5777      
5778    /**
5779     * Leave safe mode.
5780     * <p>
5781     * Check for invalid, under- & over-replicated blocks in the end of startup.
5782     */
5783    private synchronized void leave() {
5784      // if not done yet, initialize replication queues.
5785      // In the standby, do not populate repl queues
5786      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
5787        initializeReplQueues();
5788      }
5789      long timeInSafemode = now() - startTime;
5790      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
5791                                    + timeInSafemode/1000 + " secs");
5792      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
5793
5794      //Log the following only once (when transitioning from ON -> OFF)
5795      if (reached >= 0) {
5796        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
5797      }
5798      reached = -1;
5799      safeMode = null;
5800      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
5801      NameNode.stateChangeLog.info("STATE* Network topology has "
5802          + nt.getNumOfRacks() + " racks and "
5803          + nt.getNumOfLeaves() + " datanodes");
5804      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
5805          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
5806
5807      startSecretManagerIfNecessary();
5808
5809      // If startup has not yet completed, end safemode phase.
5810      StartupProgress prog = NameNode.getStartupProgress();
5811      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5812        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
5813        prog.endPhase(Phase.SAFEMODE);
5814      }
5815    }
5816
5817    /**
5818     * Check whether we have reached the threshold for 
5819     * initializing replication queues.
5820     */
5821    private synchronized boolean canInitializeReplQueues() {
5822      return shouldPopulateReplQueues()
5823          && blockSafe >= blockReplQueueThreshold;
5824    }
5825      
5826    /** 
5827     * Safe mode can be turned off iff 
5828     * the threshold is reached and 
5829     * the extension time have passed.
5830     * @return true if can leave or false otherwise.
5831     */
5832    private synchronized boolean canLeave() {
5833      if (reached == 0) {
5834        return false;
5835      }
5836
5837      if (now() - reached < extension) {
5838        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
5839        return false;
5840      }
5841
5842      if (needEnter()) {
5843        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
5844        return false;
5845      }
5846
5847      return true;
5848    }
5849      
5850    /** 
5851     * There is no need to enter safe mode 
5852     * if DFS is empty or {@link #threshold} == 0
5853     */
5854    private boolean needEnter() {
5855      return (threshold != 0 && blockSafe < blockThreshold) ||
5856        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5857        (!nameNodeHasResourcesAvailable());
5858    }
5859      
5860    /**
5861     * Check and trigger safe mode if needed. 
5862     */
5863    private void checkMode() {
5864      // Have to have write-lock since leaving safemode initializes
5865      // repl queues, which requires write lock
5866      assert hasWriteLock();
5867      if (inTransitionToActive()) {
5868        return;
5869      }
5870      // if smmthread is already running, the block threshold must have been 
5871      // reached before, there is no need to enter the safe mode again
5872      if (smmthread == null && needEnter()) {
5873        enter();
5874        // check if we are ready to initialize replication queues
5875        if (canInitializeReplQueues() && !isPopulatingReplQueues()
5876            && !haEnabled) {
5877          initializeReplQueues();
5878        }
5879        reportStatus("STATE* Safe mode ON.", false);
5880        return;
5881      }
5882      // the threshold is reached or was reached before
5883      if (!isOn() ||                           // safe mode is off
5884          extension <= 0 || threshold <= 0) {  // don't need to wait
5885        this.leave(); // leave safe mode
5886        return;
5887      }
5888      if (reached > 0) {  // threshold has already been reached before
5889        reportStatus("STATE* Safe mode ON.", false);
5890        return;
5891      }
5892      // start monitor
5893      reached = now();
5894      if (smmthread == null) {
5895        smmthread = new Daemon(new SafeModeMonitor());
5896        smmthread.start();
5897        reportStatus("STATE* Safe mode extension entered.", true);
5898      }
5899
5900      // check if we are ready to initialize replication queues
5901      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
5902        initializeReplQueues();
5903      }
5904    }
5905      
5906    /**
5907     * Set total number of blocks.
5908     */
5909    private synchronized void setBlockTotal(int total) {
5910      this.blockTotal = total;
5911      this.blockThreshold = (int) (blockTotal * threshold);
5912      this.blockReplQueueThreshold = 
5913        (int) (blockTotal * replQueueThreshold);
5914      if (haEnabled) {
5915        // After we initialize the block count, any further namespace
5916        // modifications done while in safe mode need to keep track
5917        // of the number of total blocks in the system.
5918        this.shouldIncrementallyTrackBlocks = true;
5919      }
5920      if(blockSafe < 0)
5921        this.blockSafe = 0;
5922      checkMode();
5923    }
5924      
5925    /**
5926     * Increment number of safe blocks if current block has 
5927     * reached minimal replication.
5928     * @param replication current replication 
5929     */
5930    private synchronized void incrementSafeBlockCount(short replication) {
5931      if (replication == safeReplication) {
5932        this.blockSafe++;
5933
5934        // Report startup progress only if we haven't completed startup yet.
5935        StartupProgress prog = NameNode.getStartupProgress();
5936        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5937          if (this.awaitingReportedBlocksCounter == null) {
5938            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
5939              STEP_AWAITING_REPORTED_BLOCKS);
5940          }
5941          this.awaitingReportedBlocksCounter.increment();
5942        }
5943
5944        checkMode();
5945      }
5946    }
5947      
5948    /**
5949     * Decrement number of safe blocks if current block has 
5950     * fallen below minimal replication.
5951     * @param replication current replication 
5952     */
5953    private synchronized void decrementSafeBlockCount(short replication) {
5954      if (replication == safeReplication-1) {
5955        this.blockSafe--;
5956        //blockSafe is set to -1 in manual / low resources safemode
5957        assert blockSafe >= 0 || isManual() || areResourcesLow();
5958        checkMode();
5959      }
5960    }
5961
5962    /**
5963     * Check if safe mode was entered manually
5964     */
5965    private boolean isManual() {
5966      return extension == Integer.MAX_VALUE;
5967    }
5968
5969    /**
5970     * Set manual safe mode.
5971     */
5972    private synchronized void setManual() {
5973      extension = Integer.MAX_VALUE;
5974    }
5975
5976    /**
5977     * Check if safe mode was entered due to resources being low.
5978     */
5979    private boolean areResourcesLow() {
5980      return resourcesLow;
5981    }
5982
5983    /**
5984     * Set that resources are low for this instance of safe mode.
5985     */
5986    private void setResourcesLow() {
5987      resourcesLow = true;
5988    }
5989
5990    /**
5991     * A tip on how safe mode is to be turned off: manually or automatically.
5992     */
5993    String getTurnOffTip() {
5994      if(!isOn()) {
5995        return "Safe mode is OFF.";
5996      }
5997
5998      //Manual OR low-resource safemode. (Admin intervention required)
5999      String adminMsg = "It was turned on manually. ";
6000      if (areResourcesLow()) {
6001        adminMsg = "Resources are low on NN. Please add or free up more "
6002          + "resources then turn off safe mode manually. NOTE:  If you turn off"
6003          + " safe mode before adding resources, "
6004          + "the NN will immediately return to safe mode. ";
6005      }
6006      if (isManual() || areResourcesLow()) {
6007        return adminMsg
6008          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
6009      }
6010
6011      boolean thresholdsMet = true;
6012      int numLive = getNumLiveDataNodes();
6013      String msg = "";
6014      if (blockSafe < blockThreshold) {
6015        msg += String.format(
6016          "The reported blocks %d needs additional %d"
6017          + " blocks to reach the threshold %.4f of total blocks %d.%n",
6018          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
6019        thresholdsMet = false;
6020      } else {
6021        msg += String.format("The reported blocks %d has reached the threshold"
6022            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
6023      }
6024      if (numLive < datanodeThreshold) {
6025        msg += String.format(
6026          "The number of live datanodes %d needs an additional %d live "
6027          + "datanodes to reach the minimum number %d.%n",
6028          numLive, (datanodeThreshold - numLive), datanodeThreshold);
6029        thresholdsMet = false;
6030      } else {
6031        msg += String.format("The number of live datanodes %d has reached "
6032            + "the minimum number %d. ",
6033            numLive, datanodeThreshold);
6034      }
6035      msg += (reached > 0) ? "In safe mode extension. " : "";
6036      msg += "Safe mode will be turned off automatically ";
6037
6038      if (!thresholdsMet) {
6039        msg += "once the thresholds have been reached.";
6040      } else if (reached + extension - now() > 0) {
6041        msg += ("in " + (reached + extension - now()) / 1000 + " seconds.");
6042      } else {
6043        msg += "soon.";
6044      }
6045
6046      return msg;
6047    }
6048
6049    /**
6050     * Print status every 20 seconds.
6051     */
6052    private void reportStatus(String msg, boolean rightNow) {
6053      long curTime = now();
6054      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
6055        return;
6056      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
6057      lastStatusReport = curTime;
6058    }
6059
6060    @Override
6061    public String toString() {
6062      String resText = "Current safe blocks = " 
6063        + blockSafe 
6064        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
6065        + ". Minimal replication = " + safeReplication + ".";
6066      if (reached > 0) 
6067        resText += " Threshold was reached " + new Date(reached) + ".";
6068      return resText;
6069    }
6070      
6071    /**
6072     * Checks consistency of the class state.
6073     * This is costly so only runs if asserts are enabled.
6074     */
6075    private void doConsistencyCheck() {
6076      boolean assertsOn = false;
6077      assert assertsOn = true; // set to true if asserts are on
6078      if (!assertsOn) return;
6079      
6080      if (blockTotal == -1 && blockSafe == -1) {
6081        return; // manual safe mode
6082      }
6083      int activeBlocks = blockManager.getActiveBlockCount();
6084      if ((blockTotal != activeBlocks) &&
6085          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
6086        throw new AssertionError(
6087            " SafeMode: Inconsistent filesystem state: "
6088        + "SafeMode data: blockTotal=" + blockTotal
6089        + " blockSafe=" + blockSafe + "; "
6090        + "BlockManager data: active="  + activeBlocks);
6091      }
6092    }
6093
    /**
     * Adjust the safe/total block counters by the given deltas, used by the
     * standby while edits are tailed (HA only). No-op until incremental
     * tracking is enabled by {@link #setBlockTotal(int)}.
     *
     * @param deltaSafe change to apply to blockSafe
     * @param deltaTotal change to apply to blockTotal
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      assert haEnabled;
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";
      
      blockSafe += deltaSafe;
      // setBlockTotal recomputes the thresholds and re-runs checkMode().
      setBlockTotal(blockTotal + deltaTotal);
    }
6113  }
6114    
6115  /**
6116   * Periodically check whether it is time to leave safe mode.
6117   * This thread starts when the threshold level is reached.
6118   *
6119   */
6120  class SafeModeMonitor implements Runnable {
6121    /** interval in msec for checking safe mode: {@value} */
6122    private static final long recheckInterval = 1000;
6123      
6124    /**
6125     */
6126    @Override
6127    public void run() {
6128      while (fsRunning) {
6129        writeLock();
6130        try {
6131          if (safeMode == null) { // Not in safe mode.
6132            break;
6133          }
6134          if (safeMode.canLeave()) {
6135            // Leave safe mode.
6136            safeMode.leave();
6137            smmthread = null;
6138            break;
6139          }
6140        } finally {
6141          writeUnlock();
6142        }
6143
6144        try {
6145          Thread.sleep(recheckInterval);
6146        } catch (InterruptedException ie) {
6147          // Ignored
6148        }
6149      }
6150      if (!fsRunning) {
6151        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
6152      }
6153    }
6154  }
6155    
6156  boolean setSafeMode(SafeModeAction action) throws IOException {
6157    if (action != SafeModeAction.SAFEMODE_GET) {
6158      checkSuperuserPrivilege();
6159      switch(action) {
6160      case SAFEMODE_LEAVE: // leave safe mode
6161        leaveSafeMode();
6162        break;
6163      case SAFEMODE_ENTER: // enter safe mode
6164        enterSafeMode(false);
6165        break;
6166      default:
6167        LOG.error("Unexpected safe mode action");
6168      }
6169    }
6170    return isInSafeMode();
6171  }
6172
6173  @Override
6174  public void checkSafeMode() {
6175    // safeMode is volatile, and may be set to null at any time
6176    SafeModeInfo safeMode = this.safeMode;
6177    if (safeMode != null) {
6178      safeMode.checkMode();
6179    }
6180  }
6181
6182  @Override
6183  public boolean isInSafeMode() {
6184    // safeMode is volatile, and may be set to null at any time
6185    SafeModeInfo safeMode = this.safeMode;
6186    if (safeMode == null)
6187      return false;
6188    return safeMode.isOn();
6189  }
6190
6191  @Override
6192  public boolean isInStartupSafeMode() {
6193    // safeMode is volatile, and may be set to null at any time
6194    SafeModeInfo safeMode = this.safeMode;
6195    if (safeMode == null)
6196      return false;
6197    // If the NN is in safemode, and not due to manual / low resources, we
6198    // assume it must be because of startup. If the NN had low resources during
6199    // startup, we assume it came out of startup safemode and it is now in low
6200    // resources safemode
6201    return !safeMode.isManual() && !safeMode.areResourcesLow()
6202      && safeMode.isOn();
6203  }
6204
6205  /**
6206   * Check if replication queues are to be populated
6207   * @return true when node is HAState.Active and not in the very first safemode
6208   */
6209  @Override
6210  public boolean isPopulatingReplQueues() {
6211    if (!shouldPopulateReplQueues()) {
6212      return false;
6213    }
6214    return initializedReplQueues;
6215  }
6216
6217  private boolean shouldPopulateReplQueues() {
6218    if(haContext == null || haContext.getState() == null)
6219      return false;
6220    return haContext.getState().shouldPopulateReplQueues();
6221  }
6222
6223  @Override
6224  public void incrementSafeBlockCount(int replication) {
6225    // safeMode is volatile, and may be set to null at any time
6226    SafeModeInfo safeMode = this.safeMode;
6227    if (safeMode == null)
6228      return;
6229    safeMode.incrementSafeBlockCount((short)replication);
6230  }
6231
6232  @Override
6233  public void decrementSafeBlockCount(Block b) {
6234    // safeMode is volatile, and may be set to null at any time
6235    SafeModeInfo safeMode = this.safeMode;
6236    if (safeMode == null) // mostly true
6237      return;
6238    BlockInfo storedBlock = getStoredBlock(b);
6239    if (storedBlock.isComplete()) {
6240      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
6241    }
6242  }
6243  
6244  /**
6245   * Adjust the total number of blocks safe and expected during safe mode.
6246   * If safe mode is not currently on, this is a no-op.
6247   * @param deltaSafe the change in number of safe blocks
6248   * @param deltaTotal the change i nnumber of total blocks expected
6249   */
6250  @Override
6251  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
6252    // safeMode is volatile, and may be set to null at any time
6253    SafeModeInfo safeMode = this.safeMode;
6254    if (safeMode == null)
6255      return;
6256    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
6257  }
6258
6259  /**
6260   * Set the total number of blocks in the system. 
6261   */
6262  public void setBlockTotal() {
6263    // safeMode is volatile, and may be set to null at any time
6264    SafeModeInfo safeMode = this.safeMode;
6265    if (safeMode == null)
6266      return;
6267    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
6268  }
6269
6270  /**
6271   * Get the total number of blocks in the system. 
6272   */
6273  @Override // FSNamesystemMBean
6274  @Metric
6275  public long getBlocksTotal() {
6276    return blockManager.getTotalBlocks();
6277  }
6278
6279  /**
6280   * Get the total number of COMPLETE blocks in the system.
6281   * For safe mode only complete blocks are counted.
6282   */
6283  private long getCompleteBlocksTotal() {
6284    // Calculate number of blocks under construction
6285    long numUCBlocks = 0;
6286    readLock();
6287    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
6288    try {
6289      return getBlocksTotal() - numUCBlocks;
6290    } finally {
6291      readUnlock();
6292    }
6293  }
6294
6295  /**
6296   * Enter safe mode. If resourcesLow is false, then we assume it is manual
6297   * @throws IOException
6298   */
6299  void enterSafeMode(boolean resourcesLow) throws IOException {
6300    writeLock();
6301    try {
6302      // Stop the secret manager, since rolling the master key would
6303      // try to write to the edit log
6304      stopSecretManager();
6305
6306      // Ensure that any concurrent operations have been fully synced
6307      // before entering safe mode. This ensures that the FSImage
6308      // is entirely stable on disk as soon as we're in safe mode.
6309      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
6310      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
6311      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
6312      if (isEditlogOpenForWrite) {
6313        getEditLog().logSyncAll();
6314      }
6315      if (!isInSafeMode()) {
6316        safeMode = new SafeModeInfo(resourcesLow);
6317        return;
6318      }
6319      if (resourcesLow) {
6320        safeMode.setResourcesLow();
6321      } else {
6322        safeMode.setManual();
6323      }
6324      if (isEditlogOpenForWrite) {
6325        getEditLog().logSyncAll();
6326      }
6327      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
6328          + safeMode.getTurnOffTip());
6329    } finally {
6330      writeUnlock();
6331    }
6332  }
6333
6334  /**
6335   * Leave safe mode.
6336   */
6337  void leaveSafeMode() {
6338    writeLock();
6339    try {
6340      if (!isInSafeMode()) {
6341        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
6342        return;
6343      }
6344      safeMode.leave();
6345    } finally {
6346      writeUnlock();
6347    }
6348  }
6349    
6350  String getSafeModeTip() {
6351    // There is no need to take readLock.
6352    // Don't use isInSafeMode as this.safeMode might be set to null.
6353    // after isInSafeMode returns.
6354    boolean inSafeMode;
6355    SafeModeInfo safeMode = this.safeMode;
6356    if (safeMode == null) {
6357      inSafeMode = false;
6358    } else {
6359      inSafeMode = safeMode.isOn();
6360    }
6361
6362    if (!inSafeMode) {
6363      return "";
6364    } else {
6365      return safeMode.getTurnOffTip();
6366    }
6367  }
6368
  /**
   * Roll the edit log (superuser only; refused while in safe mode).
   * @return the checkpoint signature of the rolled image/edits state
   * @throws IOException if in safe mode or the roll fails
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check under the lock: the operation category / HA state may have
      // changed between the unlocked check above and acquiring the lock.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
6384
  /**
   * Start a checkpoint on behalf of a backup node. Idempotent via the
   * retry cache: a retried RPC returns the previously computed command.
   *
   * @param backupNode registration of the node performing the checkpoint
   * @param activeNamenode registration of the active namenode
   * @return the command for the backup node to execute
   * @throws IOException if in safe mode or the checkpoint cannot start
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
        null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      // A previous attempt of this RPC already succeeded; replay its result.
      return (NamenodeCommand) cacheEntry.getPayload();
    }
    writeLock();
    NamenodeCommand cmd = null;
    try {
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      cmd = getFSImage().startCheckpoint(backupNode, activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
      // Record success (cmd != null) and the payload for future retries.
      RetryCache.setState(cacheEntry, cmd != null, cmd);
    }
  }
6408
  /**
   * Forward an incremental block report from a datanode to the
   * BlockManager, under the namesystem write lock.
   *
   * @param nodeID the reporting datanode
   * @param srdb received/deleted block information for one storage
   * @throws IOException if the BlockManager fails to process the report
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock();
    }
  }
6419  
  /**
   * Complete a checkpoint started by a backup node. Idempotent via the
   * retry cache.
   *
   * @param registration the checkpointing node
   * @param sig signature identifying the checkpoint being ended
   * @throws IOException if in safe mode or the checkpoint cannot be ended
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    // NOTE(review): only the read lock is taken here (unlike startCheckpoint,
    // which takes the write lock) — presumably ending a checkpoint does not
    // mutate namesystem state; confirm.
    readLock();
    try {
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
      success = true;
    } finally {
      readUnlock();
      RetryCache.setState(cacheEntry, success);
    }
  }
6441
  /**
   * Build a PermissionStatus owned by the filesystem owner and the
   * configured supergroup, with the given permission bits.
   */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
  }
6445
  /** Verify the caller owns {@code path} (ownership check only, no FsAction). */
  private void checkOwner(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, true, null, null, null, null);
  }
6450
  /** Verify the caller has {@code access} on the path's own inode. */
  private void checkPathAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, access, null);
  }
6456
6457  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
6458      INode inode, int snapshotId)
6459      throws IOException {
6460    for (XAttr xattr : dir.getXAttrs(inode, snapshotId)) {
6461      if (XAttrHelper.getPrefixName(xattr).
6462          equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
6463        if (pc.isSuperUser()) {
6464          throw new AccessControlException("Access is denied for " +
6465              pc.getUser() + " since the superuser is not allowed to " +
6466              "perform this operation.");
6467        }
6468      }
6469    }
6470  }
6471
  /** Verify the caller has {@code access} on the path's parent directory. */
  private void checkParentAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, access, null, null);
  }
6477
  /** Verify the caller has {@code access} on the path's last existing ancestor. */
  private void checkAncestorAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, access, null, null, null);
  }
6483
  /** Verify the caller may traverse the path (no specific FsAction required). */
  private void checkTraverse(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, null, null);
  }
6488
6489  /**
6490   * This is a wrapper for FSDirectory.resolvePath(). If the path passed
6491   * is prefixed with /.reserved/raw, then it checks to ensure that the caller
6492   * has super user privs.
6493   *
6494   * @param path The path to resolve.
6495   * @param pathComponents path components corresponding to the path
6496   * @return if the path indicates an inode, return path after replacing up to
6497   *         <inodeid> with the corresponding path of the inode, else the path
6498   *         in {@code src} as is. If the path refers to a path in the "raw"
6499   *         directory, return the non-raw pathname.
6500   * @throws FileNotFoundException
6501   * @throws AccessControlException
6502   */
6503  private String resolvePath(String path, byte[][] pathComponents)
6504      throws FileNotFoundException, AccessControlException {
6505    if (FSDirectory.isReservedRawName(path)) {
6506      checkSuperuserPrivilege();
6507    }
6508    return FSDirectory.resolvePath(path, pathComponents, dir);
6509  }
6510
  /**
   * Throw if the caller is not the superuser. A no-op when permission
   * checking is disabled.
   */
  @Override
  public void checkSuperuserPrivilege()
      throws AccessControlException {
    if (isPermissionEnabled) {
      FSPermissionChecker pc = getPermissionChecker();
      pc.checkSuperuserPrivilege();
    }
  }
6519
6520  /**
6521   * Check whether current user have permissions to access the path. For more
6522   * details of the parameters, see
6523   * {@link FSPermissionChecker#checkPermission}.
6524   */
6525  private void checkPermission(FSPermissionChecker pc,
6526      String path, boolean doCheckOwner, FsAction ancestorAccess,
6527      FsAction parentAccess, FsAction access, FsAction subAccess)
6528      throws AccessControlException, UnresolvedLinkException {
6529        checkPermission(pc, path, doCheckOwner, ancestorAccess,
6530            parentAccess, access, subAccess, false, true);
6531  }
6532
6533  /**
6534   * Check whether current user have permissions to access the path. For more
6535   * details of the parameters, see
6536   * {@link FSPermissionChecker#checkPermission}.
6537   */
6538  private void checkPermission(FSPermissionChecker pc,
6539      String path, boolean doCheckOwner, FsAction ancestorAccess,
6540      FsAction parentAccess, FsAction access, FsAction subAccess,
6541      boolean ignoreEmptyDir, boolean resolveLink)
6542      throws AccessControlException, UnresolvedLinkException {
6543    if (!pc.isSuperUser()) {
6544      waitForLoadingFSImage();
6545      readLock();
6546      try {
6547        pc.checkPermission(path, dir, doCheckOwner, ancestorAccess,
6548            parentAccess, access, subAccess, ignoreEmptyDir, resolveLink);
6549      } finally {
6550        readUnlock();
6551      }
6552    }
6553  }
6554  
6555  /**
6556   * Check to see if we have exceeded the limit on the number
6557   * of inodes.
6558   */
6559  void checkFsObjectLimit() throws IOException {
6560    if (maxFsObjects != 0 &&
6561        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
6562      throw new IOException("Exceeded the configured number of objects " +
6563                             maxFsObjects + " in the filesystem.");
6564    }
6565  }
6566
6567  /**
6568   * Get the total number of objects in the system. 
6569   */
6570  @Override // FSNamesystemMBean
6571  public long getMaxObjects() {
6572    return maxFsObjects;
6573  }
6574
  /** @return total number of inodes in the namespace. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    // There is no need to take fSNamesystem's lock as
    // FSDirectory has its own lock.
    return this.dir.totalInodes();
  }
6582
  /** @return number of blocks currently pending replication. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
6588
  /** @return number of under-replicated blocks. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
6594
  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }
6600
  /** @return number of blocks with replication currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
6606
  /** @return number of blocks queued for deletion. */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
6612
  /**
   * @return the time at which block deletion starts: namesystem start time
   *         plus the configured startup delay.
   */
  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }
6617
  /** @return number of excess (over-replicated) blocks. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
6622  
  // HA-only metric: mis-replicated blocks whose processing is postponed.
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
6628
  // HA-only metric: datanode messages queued in the BlockManager.
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
6634  
  // HA-only metric: current HA state name (e.g. active/standby).
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
6640
6641  // HA-only metric
6642  @Metric
6643  public long getMillisSinceLastLoadedEdits() {
6644    if (isInStandbyState() && editLogTailer != null) {
6645      return now() - editLogTailer.getLastLoadTimestamp();
6646    } else {
6647      return 0;
6648    }
6649  }
6650  
  /** @return capacity of the BlockManager's block map. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
6655
6656  @Override // FSNamesystemMBean
6657  public String getFSState() {
6658    return isInSafeMode() ? "safeMode" : "Operational";
6659  }
6660  
  // JMX handle assigned by registerMBean(); unregistered in shutdown().
  private ObjectName mbeanName;
  // JMX handle for an MXBean registered elsewhere; unregistered in shutdown().
  private ObjectName mxbeanName;
6663
6664  /**
6665   * Register the FSNamesystem MBean using the name
6666   *        "hadoop:service=NameNode,name=FSNamesystemState"
6667   */
6668  private void registerMBean() {
6669    // We can only implement one MXBean interface, so we keep the old one.
6670    try {
6671      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
6672      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
6673    } catch (NotCompliantMBeanException e) {
6674      throw new RuntimeException("Bad MBean setup", e);
6675    }
6676
6677    LOG.info("Registered FSNamesystemState MBean");
6678  }
6679
6680  /**
6681   * shutdown FSNamesystem
6682   */
6683  void shutdown() {
6684    if (snapshotManager != null) {
6685      snapshotManager.shutdown();
6686    }
6687    if (mbeanName != null) {
6688      MBeans.unregister(mbeanName);
6689      mbeanName = null;
6690    }
6691    if (mxbeanName != null) {
6692      MBeans.unregister(mxbeanName);
6693      mxbeanName = null;
6694    }
6695    if (dir != null) {
6696      dir.shutdown();
6697    }
6698    if (blockManager != null) {
6699      blockManager.shutdown();
6700    }
6701  }
6702
  /** @return number of live datanodes, per the DatanodeManager. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
6707
  /** @return number of dead datanodes, per the DatanodeManager. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
6712  
6713  @Override // FSNamesystemMBean
6714  public int getNumDecomLiveDataNodes() {
6715    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6716    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6717    int liveDecommissioned = 0;
6718    for (DatanodeDescriptor node : live) {
6719      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6720    }
6721    return liveDecommissioned;
6722  }
6723
6724  @Override // FSNamesystemMBean
6725  public int getNumDecomDeadDataNodes() {
6726    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6727    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
6728    int deadDecommissioned = 0;
6729    for (DatanodeDescriptor node : dead) {
6730      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6731    }
6732    return deadDecommissioned;
6733  }
6734
  /** @return number of datanodes currently being decommissioned. */
  @Override // FSNamesystemMBean
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }
6740
  /** @return number of datanodes marked stale due to delayed heartbeats. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
6747
6748  /**
6749   * Storages are marked as "content stale" after NN restart or fails over and
6750   * before NN receives the first Heartbeat followed by the first Blockreport.
6751   */
6752  @Override // FSNamesystemMBean
6753  public int getNumStaleStorages() {
6754    return getBlockManager().getDatanodeManager().getNumStaleStorages();
6755  }
6756
6757  /**
6758   * Sets the current generation stamp for legacy blocks
6759   */
6760  void setGenerationStampV1(long stamp) {
6761    generationStampV1.setCurrentValue(stamp);
6762  }
6763
6764  /**
6765   * Gets the current generation stamp for legacy blocks
6766   */
6767  long getGenerationStampV1() {
6768    return generationStampV1.getCurrentValue();
6769  }
6770
6771  /**
6772   * Gets the current generation stamp for this filesystem
6773   */
6774  void setGenerationStampV2(long stamp) {
6775    generationStampV2.setCurrentValue(stamp);
6776  }
6777
6778  /**
6779   * Gets the current generation stamp for this filesystem
6780   */
6781  long getGenerationStampV2() {
6782    return generationStampV2.getCurrentValue();
6783  }
6784
6785  /**
6786   * Upgrades the generation stamp for the filesystem
6787   * by reserving a sufficient range for all existing blocks.
6788   * Should be invoked only during the first upgrade to
6789   * sequential block IDs.
6790   */
6791  long upgradeGenerationStampToV2() {
6792    Preconditions.checkState(generationStampV2.getCurrentValue() ==
6793        GenerationStamp.LAST_RESERVED_STAMP);
6794
6795    generationStampV2.skipTo(
6796        generationStampV1.getCurrentValue() +
6797        HdfsConstants.RESERVED_GENERATION_STAMPS_V1);
6798
6799    generationStampV1Limit = generationStampV2.getCurrentValue();
6800    return generationStampV2.getCurrentValue();
6801  }
6802
6803  /**
6804   * Sets the generation stamp that delineates random and sequentially
6805   * allocated block IDs.
6806   * @param stamp set generation stamp limit to this value
6807   */
6808  void setGenerationStampV1Limit(long stamp) {
6809    Preconditions.checkState(generationStampV1Limit ==
6810                             GenerationStamp.GRANDFATHER_GENERATION_STAMP);
6811    generationStampV1Limit = stamp;
6812  }
6813
6814  /**
6815   * Gets the value of the generation stamp that delineates sequential
6816   * and random block IDs.
6817   */
6818  long getGenerationStampAtblockIdSwitch() {
6819    return generationStampV1Limit;
6820  }
6821
  /** Test-only accessor for the sequential block ID generator. */
  @VisibleForTesting
  SequentialBlockIdGenerator getBlockIdGenerator() {
    return blockIdGenerator;
  }
6826
6827  /**
6828   * Sets the maximum allocated block ID for this filesystem. This is
6829   * the basis for allocating new block IDs.
6830   */
6831  void setLastAllocatedBlockId(long blockId) {
6832    blockIdGenerator.skipTo(blockId);
6833  }
6834
6835  /**
6836   * Gets the maximum sequentially allocated block ID for this filesystem
6837   */
6838  long getLastAllocatedBlockId() {
6839    return blockIdGenerator.getCurrentValue();
6840  }
6841
6842  /**
6843   * Increments, logs and then returns the stamp
6844   */
6845  long nextGenerationStamp(boolean legacyBlock)
6846      throws IOException, SafeModeException {
6847    assert hasWriteLock();
6848    checkNameNodeSafeMode("Cannot get next generation stamp");
6849
6850    long gs;
6851    if (legacyBlock) {
6852      gs = getNextGenerationStampV1();
6853      getEditLog().logGenerationStampV1(gs);
6854    } else {
6855      gs = getNextGenerationStampV2();
6856      getEditLog().logGenerationStampV2(gs);
6857    }
6858
6859    // NB: callers sync the log
6860    return gs;
6861  }
6862
6863  @VisibleForTesting
6864  long getNextGenerationStampV1() throws IOException {
6865    long genStampV1 = generationStampV1.nextValue();
6866
6867    if (genStampV1 >= generationStampV1Limit) {
6868      // We ran out of generation stamps for legacy blocks. In practice, it
6869      // is extremely unlikely as we reserved 1T v1 generation stamps. The
6870      // result is that we can no longer append to the legacy blocks that
6871      // were created before the upgrade to sequential block IDs.
6872      throw new OutOfV1GenerationStampsException();
6873    }
6874
6875    return genStampV1;
6876  }
6877
  /** Allocate the next V2 (sequential-era) generation stamp. */
  @VisibleForTesting
  long getNextGenerationStampV2() {
    return generationStampV2.nextValue();
  }
6882
  /** @return the stamp limit separating legacy (V1) from sequential blocks. */
  long getGenerationStampV1Limit() {
    return generationStampV1Limit;
  }
6886
6887  /**
6888   * Determine whether the block ID was randomly generated (legacy) or
6889   * sequentially generated. The generation stamp value is used to
6890   * make the distinction.
6891   * @return true if the block ID was randomly generated, false otherwise.
6892   */
6893  boolean isLegacyBlock(Block block) {
6894    return block.getGenerationStamp() < getGenerationStampV1Limit();
6895  }
6896
6897  /**
6898   * Increments, logs and then returns the block ID
6899   */
6900  private long nextBlockId() throws IOException {
6901    assert hasWriteLock();
6902    checkNameNodeSafeMode("Cannot get next block ID");
6903    final long blockId = blockIdGenerator.nextValue();
6904    getEditLog().logAllocateBlockId(blockId);
6905    // NB: callers sync the log
6906    return blockId;
6907  }
6908
  /**
   * Determine whether a file has effectively been deleted: either it is
   * gone from the inode map, a recursive delete removed one of its
   * ancestors, or its current state is marked deleted in a snapshot.
   *
   * @param file the file inode to check
   * @return true if the file should be considered deleted
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        // Detached from the tree: some ancestor was removed.
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        // Reached the root with every link intact: path is alive.
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // Even with an intact path, a snapshotted file's current version may
    // have been deleted.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6946
  /**
   * Validate that a block is under construction and owned by the given
   * client's lease. Requires the write lock; refused while in safe mode.
   *
   * @param block the block to validate
   * @param clientName the lease holder claimed by the caller
   * @return the file inode the block belongs to
   * @throws IOException if the block is missing or not under construction,
   *         or the file is deleted / not under construction
   * @throws LeaseExpiredException if {@code clientName} does not hold the
   *         file's lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfo storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6978  
6979  /**
6980   * Client is reporting some bad block locations.
6981   */
6982  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6983    checkOperation(OperationCategory.WRITE);
6984    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
6985    writeLock();
6986    try {
6987      checkOperation(OperationCategory.WRITE);
6988      for (int i = 0; i < blocks.length; i++) {
6989        ExtendedBlock blk = blocks[i].getBlock();
6990        DatanodeInfo[] nodes = blocks[i].getLocations();
6991        String[] storageIDs = blocks[i].getStorageIDs();
6992        for (int j = 0; j < nodes.length; j++) {
6993          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6994              storageIDs == null ? null: storageIDs[j], 
6995              "client machine reported it");
6996        }
6997      }
6998    } finally {
6999      writeUnlock();
7000    }
7001  }
7002
7003  /**
7004   * Get a new generation stamp together with an access token for 
7005   * a block under construction
7006   * 
7007   * This method is called for recovering a failed pipeline or setting up
7008   * a pipeline to append to a block.
7009   * 
7010   * @param block a block
7011   * @param clientName the name of a client
7012   * @return a located block with a new generation stamp and an access token
7013   * @throws IOException if any error occurs
7014   */
7015  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
7016      String clientName) throws IOException {
7017    LocatedBlock locatedBlock;
7018    checkOperation(OperationCategory.WRITE);
7019    writeLock();
7020    try {
7021      checkOperation(OperationCategory.WRITE);
7022
7023      // check vadility of parameters
7024      checkUCBlock(block, clientName);
7025  
7026      // get a new generation stamp and an access token
7027      block.setGenerationStamp(
7028          nextGenerationStamp(isLegacyBlock(block.getLocalBlock())));
7029      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
7030      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
7031    } finally {
7032      writeUnlock();
7033    }
7034    // Ensure we record the new generation stamp
7035    getEditLog().logSync();
7036    return locatedBlock;
7037  }
7038  
7039  /**
7040   * Update a pipeline for a block under construction
7041   * 
7042   * @param clientName the name of the client
7043   * @param oldBlock and old block
7044   * @param newBlock a new block with a new generation stamp and length
7045   * @param newNodes datanodes in the pipeline
7046   * @throws IOException if any error occurs
7047   */
7048  void updatePipeline(String clientName, ExtendedBlock oldBlock, 
7049      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs)
7050      throws IOException {
7051    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
7052    if (cacheEntry != null && cacheEntry.isSuccess()) {
7053      return; // Return previous response
7054    }
7055    LOG.info("updatePipeline(block=" + oldBlock
7056             + ", newGenerationStamp=" + newBlock.getGenerationStamp()
7057             + ", newLength=" + newBlock.getNumBytes()
7058             + ", newNodes=" + Arrays.asList(newNodes)
7059             + ", clientName=" + clientName
7060             + ")");
7061    waitForLoadingFSImage();
7062    writeLock();
7063    boolean success = false;
7064    try {
7065      checkOperation(OperationCategory.WRITE);
7066      checkNameNodeSafeMode("Pipeline not updated");
7067      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
7068        + oldBlock + " has different block identifier";
7069      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
7070          newStorageIDs, cacheEntry != null);
7071      success = true;
7072    } finally {
7073      writeUnlock();
7074      RetryCache.setState(cacheEntry, success);
7075    }
7076    getEditLog().logSync();
7077    LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
7078  }
7079
7080  /**
7081   * @see #updatePipeline(String,  ExtendedBlock, ExtendedBlock, DatanodeID[], String[])
7082   */
7083  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, 
7084      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
7085      boolean logRetryCache)
7086      throws IOException {
7087    assert hasWriteLock();
7088    // check the vadility of the block and lease holder name
7089    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
7090    final BlockInfoUnderConstruction blockinfo
7091        = (BlockInfoUnderConstruction)pendingFile.getLastBlock();
7092
7093    // check new GS & length: this is not expected
7094    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
7095        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
7096      String msg = "Update " + oldBlock + " (len = " + 
7097        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
7098        " (len = " + newBlock.getNumBytes() +")";
7099      LOG.warn(msg);
7100      throw new IOException(msg);
7101    }
7102
7103    // Update old block with the new generation stamp and new length
7104    blockinfo.setNumBytes(newBlock.getNumBytes());
7105    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());
7106
7107    // find the DatanodeDescriptor objects
7108    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
7109        .getDatanodeStorageInfos(newNodes, newStorageIDs);
7110    blockinfo.setExpectedLocations(storages);
7111
7112    String src = pendingFile.getFullPathName();
7113    persistBlocks(src, pendingFile, logRetryCache);
7114  }
7115
7116  // rename was successful. If any part of the renamed subtree had
7117  // files that were being written to, update with new filename.
7118  void unprotectedChangeLease(String src, String dst) {
7119    assert hasWriteLock();
7120    leaseManager.changeLease(src, dst);
7121  }
7122
7123  /**
7124   * Serializes leases.
7125   */
7126  void saveFilesUnderConstruction(DataOutputStream out,
7127      Map<Long, INodeFile> snapshotUCMap) throws IOException {
7128    // This is run by an inferior thread of saveNamespace, which holds a read
7129    // lock on our behalf. If we took the read lock here, we could block
7130    // for fairness if a writer is waiting on the lock.
7131    synchronized (leaseManager) {
7132      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
7133      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
7134        // TODO: for HDFS-5428, because of rename operations, some
7135        // under-construction files that are
7136        // in the current fs directory can also be captured in the
7137        // snapshotUCMap. We should remove them from the snapshotUCMap.
7138        snapshotUCMap.remove(entry.getValue().getId());
7139      }
7140
7141      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
7142      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
7143        FSImageSerialization.writeINodeUnderConstruction(
7144            out, entry.getValue(), entry.getKey());
7145      }
7146      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
7147        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
7148        // as their paths
7149        StringBuilder b = new StringBuilder();
7150        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
7151            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
7152            .append(Path.SEPARATOR).append(entry.getValue().getId());
7153        FSImageSerialization.writeINodeUnderConstruction(
7154            out, entry.getValue(), b.toString());
7155      }
7156    }
7157  }
7158
7159  /**
7160   * @return all the under-construction files in the lease map
7161   */
7162  Map<String, INodeFile> getFilesUnderConstruction() {
7163    synchronized (leaseManager) {
7164      return leaseManager.getINodesUnderConstruction();
7165    }
7166  }
7167
7168  /**
7169   * Register a Backup name-node, verifying that it belongs
7170   * to the correct namespace, and adding it to the set of
7171   * active journals if necessary.
7172   * 
7173   * @param bnReg registration of the new BackupNode
7174   * @param nnReg registration of this NameNode
7175   * @throws IOException if the namespace IDs do not match
7176   */
7177  void registerBackupNode(NamenodeRegistration bnReg,
7178      NamenodeRegistration nnReg) throws IOException {
7179    writeLock();
7180    try {
7181      if(getFSImage().getStorage().getNamespaceID() 
7182         != bnReg.getNamespaceID())
7183        throw new IOException("Incompatible namespaceIDs: "
7184            + " Namenode namespaceID = "
7185            + getFSImage().getStorage().getNamespaceID() + "; "
7186            + bnReg.getRole() +
7187            " node namespaceID = " + bnReg.getNamespaceID());
7188      if (bnReg.getRole() == NamenodeRole.BACKUP) {
7189        getFSImage().getEditLog().registerBackupNode(
7190            bnReg, nnReg);
7191      }
7192    } finally {
7193      writeUnlock();
7194    }
7195  }
7196
7197  /**
7198   * Release (unregister) backup node.
7199   * <p>
7200   * Find and remove the backup stream corresponding to the node.
7201   * @throws IOException
7202   */
7203  void releaseBackupNode(NamenodeRegistration registration)
7204    throws IOException {
7205    checkOperation(OperationCategory.WRITE);
7206    writeLock();
7207    try {
7208      checkOperation(OperationCategory.WRITE);
7209      if(getFSImage().getStorage().getNamespaceID()
7210         != registration.getNamespaceID())
7211        throw new IOException("Incompatible namespaceIDs: "
7212            + " Namenode namespaceID = "
7213            + getFSImage().getStorage().getNamespaceID() + "; "
7214            + registration.getRole() +
7215            " node namespaceID = " + registration.getNamespaceID());
7216      getEditLog().releaseBackupStream(registration);
7217    } finally {
7218      writeUnlock();
7219    }
7220  }
7221
7222  static class CorruptFileBlockInfo {
7223    final String path;
7224    final Block block;
7225    
7226    public CorruptFileBlockInfo(String p, Block b) {
7227      path = p;
7228      block = b;
7229    }
7230    
7231    @Override
7232    public String toString() {
7233      return block.getBlockName() + "\t" + path;
7234    }
7235  }
7236  /**
7237   * @param path Restrict corrupt files to this portion of namespace.
7238   * @param cookieTab Support for continuation; cookieTab  tells where
7239   *                  to start from
7240   * @return a list in which each entry describes a corrupt file/block
7241   * @throws IOException
7242   */
7243  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
7244  String[] cookieTab) throws IOException {
7245    checkSuperuserPrivilege();
7246    checkOperation(OperationCategory.READ);
7247
7248    int count = 0;
7249    ArrayList<CorruptFileBlockInfo> corruptFiles =
7250        new ArrayList<CorruptFileBlockInfo>();
7251    if (cookieTab == null) {
7252      cookieTab = new String[] { null };
7253    }
7254
7255    // Do a quick check if there are any corrupt files without taking the lock
7256    if (blockManager.getMissingBlocksCount() == 0) {
7257      if (cookieTab[0] == null) {
7258        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
7259      }
7260      LOG.info("there are no corrupt file blocks.");
7261      return corruptFiles;
7262    }
7263
7264    readLock();
7265    try {
7266      checkOperation(OperationCategory.READ);
7267      if (!isPopulatingReplQueues()) {
7268        throw new IOException("Cannot run listCorruptFileBlocks because " +
7269                              "replication queues have not been initialized.");
7270      }
7271      // print a limited # of corrupt files per call
7272
7273      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
7274
7275      int skip = getIntCookie(cookieTab[0]);
7276      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
7277        blkIterator.next();
7278      }
7279
7280      while (blkIterator.hasNext()) {
7281        Block blk = blkIterator.next();
7282        final INode inode = (INode)blockManager.getBlockCollection(blk);
7283        skip++;
7284        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
7285          String src = FSDirectory.getFullPathName(inode);
7286          if (src.startsWith(path)){
7287            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
7288            count++;
7289            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
7290              break;
7291          }
7292        }
7293      }
7294      cookieTab[0] = String.valueOf(skip);
7295      LOG.info("list corrupt file blocks returned: " + count);
7296      return corruptFiles;
7297    } finally {
7298      readUnlock();
7299    }
7300  }
7301
7302  /**
7303   * Convert string cookie to integer.
7304   */
7305  private static int getIntCookie(String cookie){
7306    int c;
7307    if(cookie == null){
7308      c = 0;
7309    } else {
7310      try{
7311        c = Integer.parseInt(cookie);
7312      }catch (NumberFormatException e) {
7313        c = 0;
7314      }
7315    }
7316    c = Math.max(0, c);
7317    return c;
7318  }
7319
7320  /**
7321   * Create delegation token secret manager
7322   */
7323  private DelegationTokenSecretManager createDelegationTokenSecretManager(
7324      Configuration conf) {
7325    return new DelegationTokenSecretManager(conf.getLong(
7326        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
7327        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
7328        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
7329            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
7330        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
7331            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
7332        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
7333        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
7334            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
7335        this);
7336  }
7337
7338  /**
7339   * Returns the DelegationTokenSecretManager instance in the namesystem.
7340   * @return delegation token secret manager object
7341   */
7342  DelegationTokenSecretManager getDelegationTokenSecretManager() {
7343    return dtSecretManager;
7344  }
7345
7346  /**
7347   * @param renewer Renewer information
7348   * @return delegation toek
7349   * @throws IOException on error
7350   */
7351  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
7352      throws IOException {
7353    Token<DelegationTokenIdentifier> token;
7354    checkOperation(OperationCategory.WRITE);
7355    writeLock();
7356    try {
7357      checkOperation(OperationCategory.WRITE);
7358      checkNameNodeSafeMode("Cannot issue delegation token");
7359      if (!isAllowedDelegationTokenOp()) {
7360        throw new IOException(
7361          "Delegation Token can be issued only with kerberos or web authentication");
7362      }
7363      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
7364        LOG.warn("trying to get DT with no secret manager running");
7365        return null;
7366      }
7367
7368      UserGroupInformation ugi = getRemoteUser();
7369      String user = ugi.getUserName();
7370      Text owner = new Text(user);
7371      Text realUser = null;
7372      if (ugi.getRealUser() != null) {
7373        realUser = new Text(ugi.getRealUser().getUserName());
7374      }
7375      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
7376        renewer, realUser);
7377      token = new Token<DelegationTokenIdentifier>(
7378        dtId, dtSecretManager);
7379      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
7380      getEditLog().logGetDelegationToken(dtId, expiryTime);
7381    } finally {
7382      writeUnlock();
7383    }
7384    getEditLog().logSync();
7385    return token;
7386  }
7387
7388  /**
7389   * 
7390   * @param token token to renew
7391   * @return new expiryTime of the token
7392   * @throws InvalidToken if {@code token} is invalid
7393   * @throws IOException on other errors
7394   */
7395  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
7396      throws InvalidToken, IOException {
7397    long expiryTime;
7398    checkOperation(OperationCategory.WRITE);
7399    writeLock();
7400    try {
7401      checkOperation(OperationCategory.WRITE);
7402
7403      checkNameNodeSafeMode("Cannot renew delegation token");
7404      if (!isAllowedDelegationTokenOp()) {
7405        throw new IOException(
7406            "Delegation Token can be renewed only with kerberos or web authentication");
7407      }
7408      String renewer = getRemoteUser().getShortUserName();
7409      expiryTime = dtSecretManager.renewToken(token, renewer);
7410      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
7411      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
7412      DataInputStream in = new DataInputStream(buf);
7413      id.readFields(in);
7414      getEditLog().logRenewDelegationToken(id, expiryTime);
7415    } finally {
7416      writeUnlock();
7417    }
7418    getEditLog().logSync();
7419    return expiryTime;
7420  }
7421
7422  /**
7423   * 
7424   * @param token token to cancel
7425   * @throws IOException on error
7426   */
7427  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
7428      throws IOException {
7429    checkOperation(OperationCategory.WRITE);
7430    writeLock();
7431    try {
7432      checkOperation(OperationCategory.WRITE);
7433
7434      checkNameNodeSafeMode("Cannot cancel delegation token");
7435      String canceller = getRemoteUser().getUserName();
7436      DelegationTokenIdentifier id = dtSecretManager
7437        .cancelToken(token, canceller);
7438      getEditLog().logCancelDelegationToken(id);
7439    } finally {
7440      writeUnlock();
7441    }
7442    getEditLog().logSync();
7443  }
7444
7445  /**
7446   * @param out save state of the secret manager
7447   * @param sdPath String storage directory path
7448   */
7449  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
7450      throws IOException {
7451    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
7452  }
7453
7454  SecretManagerState saveSecretManagerState() {
7455    return dtSecretManager.saveSecretManagerState();
7456  }
7457
7458  /**
7459   * @param in load the state of secret manager from input stream
7460   */
7461  void loadSecretManagerStateCompat(DataInput in) throws IOException {
7462    dtSecretManager.loadSecretManagerStateCompat(in);
7463  }
7464
7465  void loadSecretManagerState(SecretManagerSection s,
7466      List<SecretManagerSection.DelegationKey> keys,
7467      List<SecretManagerSection.PersistToken> tokens) throws IOException {
7468    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
7469  }
7470
7471  /**
7472   * Log the updateMasterKey operation to edit logs
7473   * 
7474   * @param key new delegation key.
7475   */
7476  public void logUpdateMasterKey(DelegationKey key) {
7477    
7478    assert !isInSafeMode() :
7479      "this should never be called while in safemode, since we stop " +
7480      "the DT manager before entering safemode!";
7481    // No need to hold FSN lock since we don't access any internal
7482    // structures, and this is stopped before the FSN shuts itself
7483    // down, etc.
7484    getEditLog().logUpdateMasterKey(key);
7485    getEditLog().logSync();
7486  }
7487  
7488  /**
7489   * Log the cancellation of expired tokens to edit logs
7490   * 
7491   * @param id token identifier to cancel
7492   */
7493  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
7494    assert !isInSafeMode() :
7495      "this should never be called while in safemode, since we stop " +
7496      "the DT manager before entering safemode!";
7497    // No need to hold FSN lock since we don't access any internal
7498    // structures, and this is stopped before the FSN shuts itself
7499    // down, etc.
7500    getEditLog().logCancelDelegationToken(id);
7501  }  
7502  
7503  private void logReassignLease(String leaseHolder, String src,
7504      String newHolder) {
7505    assert hasWriteLock();
7506    getEditLog().logReassignLease(leaseHolder, src, newHolder);
7507  }
7508  
7509  /**
7510   * 
7511   * @return true if delegation token operation is allowed
7512   */
7513  private boolean isAllowedDelegationTokenOp() throws IOException {
7514    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
7515    if (UserGroupInformation.isSecurityEnabled()
7516        && (authMethod != AuthenticationMethod.KERBEROS)
7517        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
7518        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
7519      return false;
7520    }
7521    return true;
7522  }
7523  
7524  /**
7525   * Returns authentication method used to establish the connection
7526   * @return AuthenticationMethod used to establish connection
7527   * @throws IOException
7528   */
7529  private AuthenticationMethod getConnectionAuthenticationMethod()
7530      throws IOException {
7531    UserGroupInformation ugi = getRemoteUser();
7532    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
7533    if (authMethod == AuthenticationMethod.PROXY) {
7534      authMethod = ugi.getRealUser().getAuthenticationMethod();
7535    }
7536    return authMethod;
7537  }
7538  
7539  /**
7540   * Client invoked methods are invoked over RPC and will be in 
7541   * RPC call context even if the client exits.
7542   */
7543  private boolean isExternalInvocation() {
7544    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
7545  }
7546
7547  private static InetAddress getRemoteIp() {
7548    InetAddress ip = Server.getRemoteIp();
7549    if (ip != null) {
7550      return ip;
7551    }
7552    return NamenodeWebHdfsMethods.getRemoteIp();
7553  }
7554  
7555  // optimize ugi lookup for RPC operations to avoid a trip through
7556  // UGI.getCurrentUser which is synch'ed
7557  private static UserGroupInformation getRemoteUser() throws IOException {
7558    return NameNode.getRemoteUser();
7559  }
7560  
7561  /**
7562   * Log fsck event in the audit log 
7563   */
7564  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
7565    if (isAuditEnabled()) {
7566      logAuditEvent(true, getRemoteUser(),
7567                    remoteAddress,
7568                    "fsck", src, null, null);
7569    }
7570  }
7571  /**
7572   * Register NameNodeMXBean
7573   */
7574  private void registerMXBean() {
7575    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
7576  }
7577
7578  /**
7579   * Class representing Namenode information for JMX interfaces
7580   */
7581  @Override // NameNodeMXBean
7582  public String getVersion() {
7583    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
7584  }
7585
7586  @Override // NameNodeMXBean
7587  public long getUsed() {
7588    return this.getCapacityUsed();
7589  }
7590
7591  @Override // NameNodeMXBean
7592  public long getFree() {
7593    return this.getCapacityRemaining();
7594  }
7595
7596  @Override // NameNodeMXBean
7597  public long getTotal() {
7598    return this.getCapacityTotal();
7599  }
7600
7601  @Override // NameNodeMXBean
7602  public String getSafemode() {
7603    if (!this.isInSafeMode())
7604      return "";
7605    return "Safe mode is ON. " + this.getSafeModeTip();
7606  }
7607
7608  @Override // NameNodeMXBean
7609  public boolean isUpgradeFinalized() {
7610    return this.getFSImage().isUpgradeFinalized();
7611  }
7612
7613  @Override // NameNodeMXBean
7614  public long getNonDfsUsedSpace() {
7615    return datanodeStatistics.getCapacityUsedNonDFS();
7616  }
7617
7618  @Override // NameNodeMXBean
7619  public float getPercentUsed() {
7620    return datanodeStatistics.getCapacityUsedPercent();
7621  }
7622
7623  @Override // NameNodeMXBean
7624  public long getBlockPoolUsedSpace() {
7625    return datanodeStatistics.getBlockPoolUsed();
7626  }
7627
7628  @Override // NameNodeMXBean
7629  public float getPercentBlockPoolUsed() {
7630    return datanodeStatistics.getPercentBlockPoolUsed();
7631  }
7632
7633  @Override // NameNodeMXBean
7634  public float getPercentRemaining() {
7635    return datanodeStatistics.getCapacityRemainingPercent();
7636  }
7637
7638  @Override // NameNodeMXBean
7639  public long getCacheCapacity() {
7640    return datanodeStatistics.getCacheCapacity();
7641  }
7642
7643  @Override // NameNodeMXBean
7644  public long getCacheUsed() {
7645    return datanodeStatistics.getCacheUsed();
7646  }
7647
7648  @Override // NameNodeMXBean
7649  public long getTotalBlocks() {
7650    return getBlocksTotal();
7651  }
7652
7653  @Override // NameNodeMXBean
7654  @Metric
7655  public long getTotalFiles() {
7656    return getFilesTotal();
7657  }
7658
7659  @Override // NameNodeMXBean
7660  public long getNumberOfMissingBlocks() {
7661    return getMissingBlocksCount();
7662  }
7663  
7664  @Override // NameNodeMXBean
7665  public int getThreads() {
7666    return ManagementFactory.getThreadMXBean().getThreadCount();
7667  }
7668
7669  /**
7670   * Returned information is a JSON representation of map with host name as the
7671   * key and value is a map of live node attribute keys to its values
7672   */
7673  @Override // NameNodeMXBean
7674  public String getLiveNodes() {
7675    final Map<String, Map<String,Object>> info = 
7676      new HashMap<String, Map<String,Object>>();
7677    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
7678    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
7679    for (DatanodeDescriptor node : live) {
7680      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
7681          .put("infoAddr", node.getInfoAddr())
7682          .put("infoSecureAddr", node.getInfoSecureAddr())
7683          .put("xferaddr", node.getXferAddr())
7684          .put("lastContact", getLastContact(node))
7685          .put("usedSpace", getDfsUsed(node))
7686          .put("adminState", node.getAdminState().toString())
7687          .put("nonDfsUsedSpace", node.getNonDfsUsed())
7688          .put("capacity", node.getCapacity())
7689          .put("numBlocks", node.numBlocks())
7690          .put("version", node.getSoftwareVersion())
7691          .put("used", node.getDfsUsed())
7692          .put("remaining", node.getRemaining())
7693          .put("blockScheduled", node.getBlocksScheduled())
7694          .put("blockPoolUsed", node.getBlockPoolUsed())
7695          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
7696          .put("volfails", node.getVolumeFailures())
7697          .build();
7698
7699      info.put(node.getHostName(), innerinfo);
7700    }
7701    return JSON.toString(info);
7702  }
7703
7704  /**
7705   * Returned information is a JSON representation of map with host name as the
7706   * key and value is a map of dead node attribute keys to its values
7707   */
7708  @Override // NameNodeMXBean
7709  public String getDeadNodes() {
7710    final Map<String, Map<String, Object>> info = 
7711      new HashMap<String, Map<String, Object>>();
7712    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
7713    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
7714    for (DatanodeDescriptor node : dead) {
7715      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
7716          .put("lastContact", getLastContact(node))
7717          .put("decommissioned", node.isDecommissioned())
7718          .put("xferaddr", node.getXferAddr())
7719          .build();
7720      info.put(node.getHostName(), innerinfo);
7721    }
7722    return JSON.toString(info);
7723  }
7724
7725  /**
7726   * Returned information is a JSON representation of map with host name as the
7727   * key and value is a map of decommissioning node attribute keys to its
7728   * values
7729   */
7730  @Override // NameNodeMXBean
7731  public String getDecomNodes() {
7732    final Map<String, Map<String, Object>> info = 
7733      new HashMap<String, Map<String, Object>>();
7734    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
7735        ).getDecommissioningNodes();
7736    for (DatanodeDescriptor node : decomNodeList) {
7737      Map<String, Object> innerinfo = ImmutableMap
7738          .<String, Object> builder()
7739          .put("xferaddr", node.getXferAddr())
7740          .put("underReplicatedBlocks",
7741              node.decommissioningStatus.getUnderReplicatedBlocks())
7742          .put("decommissionOnlyReplicas",
7743              node.decommissioningStatus.getDecommissionOnlyReplicas())
7744          .put("underReplicateInOpenFiles",
7745              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
7746          .build();
7747      info.put(node.getHostName(), innerinfo);
7748    }
7749    return JSON.toString(info);
7750  }
7751
7752  private long getLastContact(DatanodeDescriptor alivenode) {
7753    return (Time.now() - alivenode.getLastUpdate())/1000;
7754  }
7755
7756  private long getDfsUsed(DatanodeDescriptor alivenode) {
7757    return alivenode.getDfsUsed();
7758  }
7759
7760  @Override  // NameNodeMXBean
7761  public String getClusterId() {
7762    return getFSImage().getStorage().getClusterID();
7763  }
7764  
7765  @Override  // NameNodeMXBean
7766  public String getBlockPoolId() {
7767    return blockPoolId;
7768  }
7769  
7770  @Override  // NameNodeMXBean
7771  public String getNameDirStatuses() {
7772    Map<String, Map<File, StorageDirType>> statusMap =
7773      new HashMap<String, Map<File, StorageDirType>>();
7774    
7775    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7776    for (Iterator<StorageDirectory> it
7777        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7778      StorageDirectory st = it.next();
7779      activeDirs.put(st.getRoot(), st.getStorageDirType());
7780    }
7781    statusMap.put("active", activeDirs);
7782    
7783    List<Storage.StorageDirectory> removedStorageDirs
7784        = getFSImage().getStorage().getRemovedStorageDirs();
7785    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7786    for (StorageDirectory st : removedStorageDirs) {
7787      failedDirs.put(st.getRoot(), st.getStorageDirType());
7788    }
7789    statusMap.put("failed", failedDirs);
7790    
7791    return JSON.toString(statusMap);
7792  }
7793
  /**
   * @return JSON with min/median/max/stdDev of per-datanode DFS-used
   *         percentages across live nodes (all zero when none are live)
   */
  @Override // NameNodeMXBean
  public String getNodeUsage() {
    float median = 0;
    float max = 0;
    float min = 0;
    float dev = 0;

    final Map<String, Map<String,Object>> info =
        new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);

    if (live.size() > 0) {
      float totalDfsUsed = 0;
      float[] usages = new float[live.size()];
      int i = 0;
      for (DatanodeDescriptor dn : live) {
        usages[i++] = dn.getDfsUsedPercent();
        totalDfsUsed += dn.getDfsUsedPercent();
      }
      // After this division totalDfsUsed holds the MEAN usage percentage.
      totalDfsUsed /= live.size();
      Arrays.sort(usages);
      // For an even-sized array this picks the upper-middle element.
      median = usages[usages.length / 2];
      max = usages[usages.length - 1];
      min = usages[0];

      // Population standard deviation around the mean computed above.
      for (i = 0; i < usages.length; i++) {
        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
      }
      dev = (float) Math.sqrt(dev / usages.length);
    }

    final Map<String, Object> innerInfo = new HashMap<String, Object>();
    innerInfo.put("min", StringUtils.format("%.2f%%", min));
    innerInfo.put("median", StringUtils.format("%.2f%%", median));
    innerInfo.put("max", StringUtils.format("%.2f%%", max));
    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
    info.put("nodeUsage", innerInfo);

    return JSON.toString(info);
  }
7835
7836  @Override  // NameNodeMXBean
7837  public String getNameJournalStatus() {
7838    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
7839    FSEditLog log = getFSImage().getEditLog();
7840    if (log != null) {
7841      boolean openForWrite = log.isOpenForWrite();
7842      for (JournalAndStream jas : log.getJournals()) {
7843        final Map<String, String> jasMap = new HashMap<String, String>();
7844        String manager = jas.getManager().toString();
7845
7846        jasMap.put("required", String.valueOf(jas.isRequired()));
7847        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
7848        jasMap.put("manager", manager);
7849
7850        if (jas.isDisabled()) {
7851          jasMap.put("stream", "Failed");
7852        } else if (openForWrite) {
7853          EditLogOutputStream elos = jas.getCurrentStream();
7854          if (elos != null) {
7855            jasMap.put("stream", elos.generateReport());
7856          } else {
7857            jasMap.put("stream", "not currently writing");
7858          }
7859        } else {
7860          jasMap.put("stream", "open for read");
7861        }
7862        jasList.add(jasMap);
7863      }
7864    }
7865    return JSON.toString(jasList);
7866  }
7867
7868  @Override // NameNodeMxBean
7869  public String getJournalTransactionInfo() {
7870    Map<String, String> txnIdMap = new HashMap<String, String>();
7871    txnIdMap.put("LastAppliedOrWrittenTxId",
7872        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7873    txnIdMap.put("MostRecentCheckpointTxId",
7874        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7875    return JSON.toString(txnIdMap);
7876  }
7877  
7878  @Override  // NameNodeMXBean
7879  public String getNNStarted() {
7880    return getStartTime().toString();
7881  }
7882
7883  @Override  // NameNodeMXBean
7884  public String getCompileInfo() {
7885    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
7886        " from " + VersionInfo.getBranch();
7887  }
7888
7889  /** @return the block manager. */
7890  public BlockManager getBlockManager() {
7891    return blockManager;
7892  }
7893  /** @return the FSDirectory. */
7894  public FSDirectory getFSDirectory() {
7895    return dir;
7896  }
7897  /** Set the FSDirectory. */
7898  @VisibleForTesting
7899  public void setFSDirectory(FSDirectory dir) {
7900    this.dir = dir;
7901  }
7902  /** @return the cache manager. */
7903  public CacheManager getCacheManager() {
7904    return cacheManager;
7905  }
7906
7907  @Override  // NameNodeMXBean
7908  public String getCorruptFiles() {
7909    List<String> list = new ArrayList<String>();
7910    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7911    try {
7912      corruptFileBlocks = listCorruptFileBlocks("/", null);
7913      int corruptFileCount = corruptFileBlocks.size();
7914      if (corruptFileCount != 0) {
7915        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7916          list.add(c.toString());
7917        }
7918      }
7919    } catch (IOException e) {
7920      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7921    }
7922    return JSON.toString(list);
7923  }
7924
7925  @Override  //NameNodeMXBean
7926  public int getDistinctVersionCount() {
7927    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
7928      .size();
7929  }
7930
7931  @Override  //NameNodeMXBean
7932  public Map<String, Integer> getDistinctVersions() {
7933    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
7934  }
7935
7936  @Override  //NameNodeMXBean
7937  public String getSoftwareVersion() {
7938    return VersionInfo.getVersion();
7939  }
7940
7941  /**
7942   * Verifies that the given identifier and password are valid and match.
7943   * @param identifier Token identifier.
7944   * @param password Password in the token.
7945   */
7946  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7947      byte[] password) throws InvalidToken, RetriableException {
7948    try {
7949      getDelegationTokenSecretManager().verifyToken(identifier, password);
7950    } catch (InvalidToken it) {
7951      if (inTransitionToActive()) {
7952        throw new RetriableException(it);
7953      }
7954      throw it;
7955    }
7956  }
7957  
7958  @Override
7959  public boolean isGenStampInFuture(Block block) {
7960    if (isLegacyBlock(block)) {
7961      return block.getGenerationStamp() > getGenerationStampV1();
7962    } else {
7963      return block.getGenerationStamp() > getGenerationStampV2();
7964    }
7965  }
7966
7967  @VisibleForTesting
7968  public EditLogTailer getEditLogTailer() {
7969    return editLogTailer;
7970  }
7971  
7972  @VisibleForTesting
7973  public void setEditLogTailerForTests(EditLogTailer tailer) {
7974    this.editLogTailer = tailer;
7975  }
7976  
7977  @VisibleForTesting
7978  void setFsLockForTests(ReentrantReadWriteLock lock) {
7979    this.fsLock.coarseLock = lock;
7980  }
7981  
7982  @VisibleForTesting
7983  public ReentrantReadWriteLock getFsLockForTests() {
7984    return fsLock.coarseLock;
7985  }
7986  
7987  @VisibleForTesting
7988  public ReentrantLock getLongReadLockForTests() {
7989    return fsLock.longReadLock;
7990  }
7991
7992  @VisibleForTesting
7993  public SafeModeInfo getSafeModeInfoForTests() {
7994    return safeMode;
7995  }
7996  
7997  @VisibleForTesting
7998  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
7999    this.nnResourceChecker = nnResourceChecker;
8000  }
8001
  /**
   * Whether block placement for writes should avoid DataNodes considered
   * stale, as decided by the DatanodeManager's current state.
   */
  @Override
  public boolean isAvoidingStaleDataNodesForWrite() {
    return this.blockManager.getDatanodeManager()
        .shouldAvoidStaleDataNodesForWrite();
  }

  /** @return the number of DataNodes currently in service. */
  @Override // FSClusterStats
  public int getNumDatanodesInService() {
    return datanodeStatistics.getNumDatanodesInService();
  }
8012  
8013  @Override // for block placement strategy
8014  public double getInServiceXceiverAverage() {
8015    double avgLoad = 0;
8016    final int nodes = getNumDatanodesInService();
8017    if (nodes != 0) {
8018      final int xceivers = datanodeStatistics.getInServiceXceiverCount();
8019      avgLoad = (double)xceivers/nodes;
8020    }
8021    return avgLoad;
8022  }
8023
8024  public SnapshotManager getSnapshotManager() {
8025    return snapshotManager;
8026  }
8027  
  /**
   * Allow snapshots on a directory (mark it snapshottable).
   * Requires superuser privilege; the change is recorded in the edit log
   * and the audit log.
   *
   * @param path directory to mark as snapshottable
   * @throws SafeModeException if the NameNode is in safe mode
   * @throws IOException on other errors
   */
  void allowSnapshot(String path) throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();

      // FSDirectory lock nests inside the namesystem write lock.
      dir.writeLock();
      try {
        snapshotManager.setSnapshottable(path, true);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logAllowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "allowSnapshot", path, null, null);
    }
  }
8053  
  /**
   * Disallow snapshots on a directory (clear its snapshottable status).
   * Requires superuser privilege; the change is recorded in the edit log
   * and the audit log.
   *
   * @param path directory to reset
   * @throws SafeModeException if the NameNode is in safe mode
   * @throws IOException on other errors
   */
  void disallowSnapshot(String path) throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();

      // FSDirectory lock nests inside the namesystem write lock.
      dir.writeLock();
      try {
        snapshotManager.resetSnapshottable(path);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logDisallowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "disallowSnapshot", path, null, null);
    }
  }
8079  
8080  /**
8081   * Create a snapshot
8082   * @param snapshotRoot The directory path where the snapshot is taken
8083   * @param snapshotName The name of the snapshot
8084   */
8085  String createSnapshot(String snapshotRoot, String snapshotName)
8086      throws SafeModeException, IOException {
8087    checkOperation(OperationCategory.WRITE);
8088    final FSPermissionChecker pc = getPermissionChecker();
8089    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
8090        null);
8091    if (cacheEntry != null && cacheEntry.isSuccess()) {
8092      return (String) cacheEntry.getPayload();
8093    }
8094    String snapshotPath = null;
8095    writeLock();
8096    try {
8097      checkOperation(OperationCategory.WRITE);
8098      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
8099      if (isPermissionEnabled) {
8100        checkOwner(pc, snapshotRoot);
8101      }
8102
8103      if (snapshotName == null || snapshotName.isEmpty()) {
8104        snapshotName = Snapshot.generateDefaultSnapshotName();
8105      }
8106      if(snapshotName != null){
8107        if (!DFSUtil.isValidNameForComponent(snapshotName)) {
8108            throw new InvalidPathException("Invalid snapshot name: "
8109                + snapshotName);
8110        }
8111      }
8112      dir.verifySnapshotName(snapshotName, snapshotRoot);
8113      dir.writeLock();
8114      try {
8115        snapshotPath = snapshotManager.createSnapshot(snapshotRoot, snapshotName);
8116      } finally {
8117        dir.writeUnlock();
8118      }
8119      getEditLog().logCreateSnapshot(snapshotRoot, snapshotName,
8120          cacheEntry != null);
8121    } finally {
8122      writeUnlock();
8123      RetryCache.setState(cacheEntry, snapshotPath != null, snapshotPath);
8124    }
8125    getEditLog().logSync();
8126    
8127    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8128      logAuditEvent(true, "createSnapshot", snapshotRoot, snapshotPath, null);
8129    }
8130    return snapshotPath;
8131  }
8132  
8133  /**
8134   * Rename a snapshot
8135   * @param path The directory path where the snapshot was taken
8136   * @param snapshotOldName Old snapshot name
8137   * @param snapshotNewName New snapshot name
8138   * @throws SafeModeException
8139   * @throws IOException 
8140   */
8141  void renameSnapshot(String path, String snapshotOldName,
8142      String snapshotNewName) throws SafeModeException, IOException {
8143    final FSPermissionChecker pc = getPermissionChecker();
8144    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
8145    if (cacheEntry != null && cacheEntry.isSuccess()) {
8146      return; // Return previous response
8147    }
8148    writeLock();
8149    boolean success = false;
8150    try {
8151      checkOperation(OperationCategory.WRITE);
8152      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
8153      if (isPermissionEnabled) {
8154        checkOwner(pc, path);
8155      }
8156      dir.verifySnapshotName(snapshotNewName, path);
8157      
8158      snapshotManager.renameSnapshot(path, snapshotOldName, snapshotNewName);
8159      getEditLog().logRenameSnapshot(path, snapshotOldName, snapshotNewName,
8160          cacheEntry != null);
8161      success = true;
8162    } finally {
8163      writeUnlock();
8164      RetryCache.setState(cacheEntry, success);
8165    }
8166    getEditLog().logSync();
8167    
8168    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8169      String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
8170      String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
8171      logAuditEvent(true, "renameSnapshot", oldSnapshotRoot, newSnapshotRoot, null);
8172    }
8173  }
8174  
8175  /**
8176   * Get the list of snapshottable directories that are owned 
8177   * by the current user. Return all the snapshottable directories if the 
8178   * current user is a super user.
8179   * @return The list of all the current snapshottable directories
8180   * @throws IOException
8181   */
8182  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
8183      throws IOException {
8184    SnapshottableDirectoryStatus[] status = null;
8185    checkOperation(OperationCategory.READ);
8186    final FSPermissionChecker checker = getPermissionChecker();
8187    readLock();
8188    try {
8189      checkOperation(OperationCategory.READ);
8190      final String user = checker.isSuperUser()? null : checker.getUser();
8191      status = snapshotManager.getSnapshottableDirListing(user);
8192    } finally {
8193      readUnlock();
8194    }
8195    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8196      logAuditEvent(true, "listSnapshottableDirectory", null, null, null);
8197    }
8198    return status;
8199  }
8200  
8201  /**
8202   * Get the difference between two snapshots (or between a snapshot and the
8203   * current status) of a snapshottable directory.
8204   * 
8205   * @param path The full path of the snapshottable directory.
8206   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
8207   *          or empty string indicates the current tree.
8208   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
8209   *          empty string indicates the current tree.
8210   * @return A report about the difference between {@code fromSnapshot} and 
8211   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
8212   *         directories belonging to the snapshottable directories are listed 
8213   *         and labeled as M/-/+/R respectively. 
8214   * @throws IOException
8215   */
8216  SnapshotDiffReport getSnapshotDiffReport(String path,
8217      String fromSnapshot, String toSnapshot) throws IOException {
8218    SnapshotDiffReport diffs;
8219    checkOperation(OperationCategory.READ);
8220    final FSPermissionChecker pc = getPermissionChecker();
8221    readLock();
8222    try {
8223      checkOperation(OperationCategory.READ);
8224      if (isPermissionEnabled) {
8225        checkSubtreeReadPermission(pc, path, fromSnapshot);
8226        checkSubtreeReadPermission(pc, path, toSnapshot);
8227      }
8228      diffs = snapshotManager.diff(path, fromSnapshot, toSnapshot);
8229    } finally {
8230      readUnlock();
8231    }
8232
8233    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8234      logAuditEvent(true, "computeSnapshotDiff", null, null, null);
8235    }
8236    return diffs;
8237  }
8238  
8239  private void checkSubtreeReadPermission(final FSPermissionChecker pc,
8240      final String snapshottablePath, final String snapshot)
8241          throws AccessControlException, UnresolvedLinkException {
8242    final String fromPath = snapshot == null?
8243        snapshottablePath: Snapshot.getSnapshotPath(snapshottablePath, snapshot);
8244    checkPermission(pc, fromPath, false, null, null, FsAction.READ, FsAction.READ);
8245  }
8246  
8247  /**
8248   * Delete a snapshot of a snapshottable directory
8249   * @param snapshotRoot The snapshottable directory
8250   * @param snapshotName The name of the to-be-deleted snapshot
8251   * @throws SafeModeException
8252   * @throws IOException
8253   */
8254  void deleteSnapshot(String snapshotRoot, String snapshotName)
8255      throws SafeModeException, IOException {
8256    final FSPermissionChecker pc = getPermissionChecker();
8257    
8258    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
8259    if (cacheEntry != null && cacheEntry.isSuccess()) {
8260      return; // Return previous response
8261    }
8262    boolean success = false;
8263    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
8264    writeLock();
8265    try {
8266      checkOperation(OperationCategory.WRITE);
8267      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
8268      if (isPermissionEnabled) {
8269        checkOwner(pc, snapshotRoot);
8270      }
8271
8272      List<INode> removedINodes = new ChunkedArrayList<INode>();
8273      dir.writeLock();
8274      try {
8275        snapshotManager.deleteSnapshot(snapshotRoot, snapshotName,
8276            collectedBlocks, removedINodes);
8277        dir.removeFromInodeMap(removedINodes);
8278      } finally {
8279        dir.writeUnlock();
8280      }
8281      removedINodes.clear();
8282      getEditLog().logDeleteSnapshot(snapshotRoot, snapshotName,
8283          cacheEntry != null);
8284      success = true;
8285    } finally {
8286      writeUnlock();
8287      RetryCache.setState(cacheEntry, success);
8288    }
8289    getEditLog().logSync();
8290
8291    removeBlocks(collectedBlocks);
8292    collectedBlocks.clear();
8293
8294    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8295      String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
8296      logAuditEvent(true, "deleteSnapshot", rootPath, null, null);
8297    }
8298  }
8299
8300  /**
8301   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
8302   * @param toRemove the list of INodeDirectorySnapshottable to be removed
8303   */
8304  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
8305    if (snapshotManager != null) {
8306      snapshotManager.removeSnapshottable(toRemove);
8307    }
8308  }
8309
  /**
   * Returns the current rolling upgrade status, refreshing its
   * rollback-image flag from the FSImage; null when no upgrade is active.
   * Requires superuser privilege.
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (rollingUpgradeInfo != null) {
        // Refresh: a rollback image may have been created since start.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
      return rollingUpgradeInfo;
    } finally {
      readUnlock();
    }
  }
8324
  /**
   * Starts a rolling upgrade. Idempotent: if an upgrade is already in
   * progress, the existing info is returned. For non-HA deployments the
   * NameNode must be in safe mode (an extra rollback checkpoint is taken);
   * for HA it must NOT be in safe mode. Requires superuser privilege.
   *
   * @return the rolling upgrade info (new or pre-existing)
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock();
    }

    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "startRollingUpgrade", null, null, null);
    }
    return rollingUpgradeInfo;
  }
8357
8358  /**
8359   * Update internal state to indicate that a rolling upgrade is in progress.
8360   * @param startTime rolling upgrade start time
8361   */
8362  void startRollingUpgradeInternal(long startTime)
8363      throws IOException {
8364    checkRollingUpgrade("start rolling upgrade");
8365    getFSImage().checkUpgrade();
8366    setRollingUpgradeInfo(false, startTime);
8367  }
8368
8369  /**
8370   * Update internal state to indicate that a rolling upgrade is in progress for
8371   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
8372   * checkpoint for rollback the namesystem will quit the safemode automatically 
8373   */
8374  private void startRollingUpgradeInternalForNonHA(long startTime)
8375      throws IOException {
8376    Preconditions.checkState(!haEnabled);
8377    if (!isInSafeMode()) {
8378      throw new IOException("Safe mode should be turned ON "
8379          + "in order to create namespace image.");
8380    }
8381    checkRollingUpgrade("start rolling upgrade");
8382    getFSImage().checkUpgrade();
8383    // in non-HA setup, we do an extra checkpoint to generate a rollback image
8384    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
8385    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
8386
8387    // leave SafeMode automatically
8388    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
8389    setRollingUpgradeInfo(true, startTime);
8390  }
8391
8392  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
8393    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
8394        createdRollbackImages, startTime, 0L);
8395  }
8396
8397  public void setCreatedRollbackImages(boolean created) {
8398    if (rollingUpgradeInfo != null) {
8399      rollingUpgradeInfo.setCreatedRollbackImages(created);
8400    }
8401  }
8402
8403  public RollingUpgradeInfo getRollingUpgradeInfo() {
8404    return rollingUpgradeInfo;
8405  }
8406
8407  public boolean isNeedRollbackFsImage() {
8408    return needRollbackFsImage;
8409  }
8410
8411  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
8412    this.needRollbackFsImage = needRollbackFsImage;
8413  }
8414
  /**
   * Rolling upgrade status for the MXBean, or null when no upgrade is in
   * progress. When the rollback-image flag is unset, re-checks under the
   * read lock whether a rollback image has since been created.
   */
  @Override  // NameNodeMXBean
  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
    if (!isRollingUpgrade()) {
      return null;
    }
    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
    // Fast path: flag already set, no lock needed.
    if (upgradeInfo.createdRollbackImages()) {
      return new RollingUpgradeInfo.Bean(upgradeInfo);
    }
    readLock();
    try {
      // check again after acquiring the read lock.
      upgradeInfo = getRollingUpgradeInfo();
      if (upgradeInfo == null) {
        return null;
      }
      if (!upgradeInfo.createdRollbackImages()) {
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
    } catch (IOException ioe) {
      // Best-effort: report current info even if the image check failed.
      LOG.warn("Encountered exception setting Rollback Image", ioe);
    } finally {
      readUnlock();
    }
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
8442
8443  /** Is rolling upgrade in progress? */
8444  public boolean isRollingUpgrade() {
8445    return rollingUpgradeInfo != null;
8446  }
8447
8448  void checkRollingUpgrade(String action) throws RollingUpgradeException {
8449    if (isRollingUpgrade()) {
8450      throw new RollingUpgradeException("Failed to " + action
8451          + " since a rolling upgrade is already in progress."
8452          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
8453    }
8454  }
8455
8456  void finalizeRollingUpgrade() throws IOException {
8457    checkSuperuserPrivilege();
8458    checkOperation(OperationCategory.WRITE);
8459    writeLock();
8460    final RollingUpgradeInfo returnInfo;
8461    try {
8462      checkOperation(OperationCategory.WRITE);
8463      if (!isRollingUpgrade()) {
8464        return;
8465      }
8466      checkNameNodeSafeMode("Failed to finalize rolling upgrade");
8467
8468      returnInfo = finalizeRollingUpgradeInternal(now());
8469      getEditLog().logFinalizeRollingUpgrade(returnInfo.getFinalizeTime());
8470      if (haEnabled) {
8471        // roll the edit log to make sure the standby NameNode can tail
8472        getFSImage().rollEditLog();
8473      }
8474      getFSImage().updateStorageVersion();
8475      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
8476          NameNodeFile.IMAGE);
8477    } finally {
8478      writeUnlock();
8479    }
8480
8481    if (!haEnabled) {
8482      // Sync not needed for ha since the edit was rolled after logging.
8483      getEditLog().logSync();
8484    }
8485
8486    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
8487      logAuditEvent(true, "finalizeRollingUpgrade", null, null, null);
8488    }
8489    return;
8490  }
8491
8492  RollingUpgradeInfo finalizeRollingUpgradeInternal(long finalizeTime)
8493      throws RollingUpgradeException {
8494    final long startTime = rollingUpgradeInfo.getStartTime();
8495    rollingUpgradeInfo = null;
8496    return new RollingUpgradeInfo(blockPoolId, false, startTime, finalizeTime);
8497  }
8498
  /**
   * Adds a new cache directive. The directive must not carry an ID (one is
   * assigned by the CacheManager). Unless FORCE is set, waits for any
   * pending cache rescan before taking the lock.
   *
   * @param directive the directive to add (id must be null)
   * @param flags FORCE skips the pre-add rescan wait
   * @return the id assigned to the new directive
   * @throws IOException in safe mode, on permission failure, or if an id
   *         was supplied
   */
  long addCacheDirective(CacheDirectiveInfo directive, EnumSet<CacheFlag> flags)
      throws IOException {
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    // Retry cache: if this RPC already executed, return the cached id.
    CacheEntryWithPayload cacheEntry =
        RetryCache.waitForCompletion(retryCache, null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (Long) cacheEntry.getPayload();
    }
    boolean success = false;
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    String effectiveDirectiveStr = null;
    Long result = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      if (directive.getId() != null) {
        throw new IOException("addDirective: you cannot specify an ID " +
            "for this operation.");
      }
      CacheDirectiveInfo effectiveDirective =
          cacheManager.addDirective(directive, pc, flags);
      getEditLog().logAddCacheDirectiveInfo(effectiveDirective,
          cacheEntry != null);
      result = effectiveDirective.getId();
      effectiveDirectiveStr = effectiveDirective.toString();
      success = true;
    } finally {
      writeUnlock();
      // Sync, audit, and retry-cache update happen on both success and
      // failure paths; sync only when the edit was actually logged.
      if (success) {
        getEditLog().logSync();
      }
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCacheDirective", effectiveDirectiveStr, null, null);
      }
      RetryCache.setState(cacheEntry, success, result);
    }
    return result;
  }
8544
8545  void modifyCacheDirective(CacheDirectiveInfo directive,
8546      EnumSet<CacheFlag> flags) throws IOException {
8547    final FSPermissionChecker pc = isPermissionEnabled ?
8548        getPermissionChecker() : null;
8549    boolean success = false;
8550    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
8551    if (cacheEntry != null && cacheEntry.isSuccess()) {
8552      return;
8553    }
8554    if (!flags.contains(CacheFlag.FORCE)) {
8555      cacheManager.waitForRescanIfNeeded();
8556    }
8557    writeLock();
8558    try {
8559      checkOperation(OperationCategory.WRITE);
8560      if (isInSafeMode()) {
8561        throw new SafeModeException(
8562            "Cannot add cache directive", safeMode);
8563      }
8564      cacheManager.modifyDirective(directive, pc, flags);
8565      getEditLog().logModifyCacheDirectiveInfo(directive,
8566          cacheEntry != null);
8567      success = true;
8568    } finally {
8569      writeUnlock();
8570      if (success) {
8571        getEditLog().logSync();
8572      }
8573      if (isAuditEnabled() && isExternalInvocation()) {
8574        String idStr = "{id: " + directive.getId().toString() + "}";
8575        logAuditEvent(success, "modifyCacheDirective", idStr, directive.toString(), null);
8576      }
8577      RetryCache.setState(cacheEntry, success);
8578    }
8579  }
8580
  /**
   * Removes the cache directive with the given id.
   *
   * @param id id of the directive to remove
   * @throws IOException in safe mode or on permission/lookup failure
   */
  void removeCacheDirective(Long id) throws IOException {
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    // Retry cache: a previously-completed retry returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return;
    }
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      cacheManager.removeDirective(id, pc);
      getEditLog().logRemoveCacheDirectiveInfo(id, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        String idStr = "{id: " + id.toString() + "}";
        logAuditEvent(success, "removeCacheDirective", idStr, null,
            null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
  }
8610
  /**
   * Lists cache directives in batches, starting after {@code startId} and
   * restricted by {@code filter}. Waits for any pending cache rescan first.
   *
   * @param startId resume listing after this directive id
   * @param filter restricts which directives are returned; presumably
   *          non-null — the audit path calls filter.toString() (verify)
   * @return one batch of matching directive entries
   */
  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
      long startId, CacheDirectiveInfo filter) throws IOException {
    checkOperation(OperationCategory.READ);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    BatchedListEntries<CacheDirectiveEntry> results;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    boolean success = false;
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.READ);
      results =
          cacheManager.listCacheDirectives(startId, filter, pc);
      success = true;
    } finally {
      readUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "listCacheDirectives", filter.toString(), null,
            null);
      }
    }
    return results;
  }
8634
  /**
   * Creates a new cache pool. Requires superuser privilege when
   * permissions are enabled.
   *
   * @param req the pool to create
   * @throws IOException in safe mode or on permission/validation failure
   */
  public void addCachePool(CachePoolInfo req) throws IOException {
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    // Retry cache: a previously-completed retry returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      // The manager fills in defaults; log the effective pool info.
      CachePoolInfo info = cacheManager.addCachePool(req);
      poolInfoStr = info.toString();
      getEditLog().logAddCachePool(info, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCachePool", poolInfoStr, null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
  }
8668
  /**
   * Modifies an existing cache pool. Requires superuser privilege when
   * permissions are enabled.
   *
   * @param req the pool fields to change, keyed by pool name
   * @throws IOException in safe mode or on permission/lookup failure
   */
  public void modifyCachePool(CachePoolInfo req) throws IOException {
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    // Retry cache: a previously-completed retry returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.modifyCachePool(req);
      getEditLog().logModifyCachePool(req, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        String poolNameStr = "{poolName: " + req.getPoolName() + "}";
        logAuditEvent(success, "modifyCachePool", poolNameStr, req.toString(), null);
      }
      RetryCache.setState(cacheEntry, success);
    }

    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
  }
8701
  /**
   * Removes a cache pool (and, via the CacheManager, its directives).
   * Requires superuser privilege when permissions are enabled.
   *
   * @param cachePoolName name of the pool to remove
   * @throws IOException in safe mode or on permission/lookup failure
   */
  public void removeCachePool(String cachePoolName) throws IOException {
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    // Retry cache: a previously-completed retry returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.removeCachePool(cachePoolName);
      getEditLog().logRemoveCachePool(cachePoolName, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        String poolNameStr = "{poolName: " + cachePoolName + "}";
        logAuditEvent(success, "removeCachePool", poolNameStr, null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
  }
8734
  /**
   * Lists cache pools in batches, resuming after {@code prevKey}.
   * Waits for any pending cache rescan before reading.
   *
   * @param prevKey pool name to resume after
   * @return one batch of cache pool entries visible to the caller
   */
  public BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.READ);
      results = cacheManager.listCachePools(pc, prevKey);
      success = true;
    } finally {
      readUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "listCachePools", null, null, null);
      }
    }
    return results;
  }
8756
  /**
   * Modifies ACL entries on a path; only the owner may change ACLs. The
   * resulting full ACL is recorded in the edit log, and the attempt is
   * audit-logged under the caller-supplied (unresolved) path.
   *
   * @param srcArg path as given by the caller (used for audit logging)
   * @param aclSpec ACL entries to add or replace
   * @throws IOException in safe mode, on permission failure, or if ACLs
   *         are disabled by configuration
   */
  void modifyAclEntries(final String srcArg, List<AclEntry> aclSpec)
      throws IOException {
    String src = srcArg;
    nnConf.checkAclsConfigFlag();
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    // Resolve /.reserved paths (e.g. inode-id paths) before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      List<AclEntry> newAcl = dir.modifyAclEntries(src, aclSpec);
      getEditLog().logSetAcl(src, newAcl);
      resultingStat = getAuditFileInfo(src, false);
    } catch (AccessControlException e) {
      logAuditEvent(false, "modifyAclEntries", srcArg);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    logAuditEvent(true, "modifyAclEntries", srcArg, null, resultingStat);
  }
8783
  /**
   * Removes the given ACL entries from a path; only the owner may change
   * ACLs. The resulting full ACL is recorded in the edit log, and the
   * attempt is audit-logged under the caller-supplied (unresolved) path.
   *
   * @param srcArg path as given by the caller (used for audit logging)
   * @param aclSpec ACL entries to remove
   * @throws IOException in safe mode, on permission failure, or if ACLs
   *         are disabled by configuration
   */
  void removeAclEntries(final String srcArg, List<AclEntry> aclSpec)
      throws IOException {
    String src = srcArg;
    nnConf.checkAclsConfigFlag();
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    // Resolve /.reserved paths (e.g. inode-id paths) before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      List<AclEntry> newAcl = dir.removeAclEntries(src, aclSpec);
      getEditLog().logSetAcl(src, newAcl);
      resultingStat = getAuditFileInfo(src, false);
    } catch (AccessControlException e) {
      logAuditEvent(false, "removeAclEntries", srcArg);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    logAuditEvent(true, "removeAclEntries", srcArg, null, resultingStat);
  }
8810
8811  void removeDefaultAcl(final String srcArg) throws IOException {
8812    String src = srcArg;
8813    nnConf.checkAclsConfigFlag();
8814    HdfsFileStatus resultingStat = null;
8815    FSPermissionChecker pc = getPermissionChecker();
8816    checkOperation(OperationCategory.WRITE);
8817    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
8818    writeLock();
8819    try {
8820      checkOperation(OperationCategory.WRITE);
8821      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
8822      src = resolvePath(src, pathComponents);
8823      checkOwner(pc, src);
8824      List<AclEntry> newAcl = dir.removeDefaultAcl(src);
8825      getEditLog().logSetAcl(src, newAcl);
8826      resultingStat = getAuditFileInfo(src, false);
8827    } catch (AccessControlException e) {
8828      logAuditEvent(false, "removeDefaultAcl", srcArg);
8829      throw e;
8830    } finally {
8831      writeUnlock();
8832    }
8833    getEditLog().logSync();
8834    logAuditEvent(true, "removeDefaultAcl", srcArg, null, resultingStat);
8835  }
8836
  /**
   * Removes the full ACL from the file or directory at {@code srcArg}.
   * Requires ownership of the path. The removal is persisted to the edit
   * log as a setAcl with an empty entry list, and an audit event is emitted
   * with the caller-supplied path.
   *
   * @param srcArg path whose ACL is removed
   * @throws IOException on safemode, ACL-disabled, or directory errors
   */
  void removeAcl(final String srcArg) throws IOException {
    String src = srcArg;
    // Rejects the call outright if ACL support is disabled in configuration.
    nnConf.checkAclsConfigFlag();
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check the operation category under the lock (HA state may change).
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      dir.removeAcl(src);
      // Persisted as a setAcl with an empty list rather than a dedicated op.
      getEditLog().logSetAcl(src, AclFeature.EMPTY_ENTRY_LIST);
      resultingStat = getAuditFileInfo(src, false);
    } catch (AccessControlException e) {
      // Audit failures with the caller-supplied (unresolved) path.
      logAuditEvent(false, "removeAcl", srcArg);
      throw e;
    } finally {
      writeUnlock();
    }
    // Flush the edit log outside the lock.
    getEditLog().logSync();
    logAuditEvent(true, "removeAcl", srcArg, null, resultingStat);
  }
8862
8863  void setAcl(final String srcArg, List<AclEntry> aclSpec) throws IOException {
8864    String src = srcArg;
8865    nnConf.checkAclsConfigFlag();
8866    HdfsFileStatus resultingStat = null;
8867    FSPermissionChecker pc = getPermissionChecker();
8868    checkOperation(OperationCategory.WRITE);
8869    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
8870    writeLock();
8871    try {
8872      checkOperation(OperationCategory.WRITE);
8873      checkNameNodeSafeMode("Cannot set ACL on " + src);
8874      src = resolvePath(src, pathComponents);
8875      checkOwner(pc, src);
8876      List<AclEntry> newAcl = dir.setAcl(src, aclSpec);
8877      getEditLog().logSetAcl(src, newAcl);
8878      resultingStat = getAuditFileInfo(src, false);
8879    } catch (AccessControlException e) {
8880      logAuditEvent(false, "setAcl", srcArg);
8881      throw e;
8882    } finally {
8883      writeUnlock();
8884    }
8885    getEditLog().logSync();
8886    logAuditEvent(true, "setAcl", srcArg, null, resultingStat);
8887  }
8888
  /**
   * Returns the AclStatus of the given path as reported by FSDirectory.
   * Read-only operation.
   *
   * @param src path to query
   * @return the path's AclStatus
   * @throws IOException if ACLs are disabled or the path cannot be resolved
   */
  AclStatus getAclStatus(String src) throws IOException {
    nnConf.checkAclsConfigFlag();
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      src = resolvePath(src, pathComponents);
      if (isPermissionEnabled) {
        // All-null access arguments: presumably only the default ancestor
        // traversal checks apply here — TODO confirm in FSPermissionChecker.
        checkPermission(pc, src, false, null, null, null, null);
      }
      final AclStatus ret = dir.getAclStatus(src);
      success = true;
      return ret;
    } finally {
      readUnlock();
      // NOTE(review): audits the resolved path, while the mutating ACL ops
      // audit the caller-supplied path — confirm this difference is intended.
      logAuditEvent(success, "getAclStatus", src);
    }
  }
8910
8911  /**
8912   * Create an encryption zone on directory src using the specified key.
8913   *
8914   * @param src     the path of a directory which will be the root of the
8915   *                encryption zone. The directory must be empty.
8916   * @param keyName name of a key which must be present in the configured
8917   *                KeyProvider.
8918   * @throws AccessControlException  if the caller is not the superuser.
8919   * @throws UnresolvedLinkException if the path can't be resolved.
8920   * @throws SafeModeException       if the Namenode is in safe mode.
8921   */
8922  void createEncryptionZone(final String src, final String keyName)
8923    throws IOException, UnresolvedLinkException,
8924      SafeModeException, AccessControlException {
8925    final CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
8926    if (cacheEntry != null && cacheEntry.isSuccess()) {
8927      return; // Return previous response
8928    }
8929
8930    boolean success = false;
8931    try {
8932      if (provider == null) {
8933        throw new IOException(
8934            "Can't create an encryption zone for " + src +
8935            " since no key provider is available.");
8936      }
8937      if (keyName == null || keyName.isEmpty()) {
8938        throw new IOException("Must specify a key name when creating an " +
8939            "encryption zone");
8940      }
8941      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
8942      if (metadata == null) {
8943        /*
8944         * It would be nice if we threw something more specific than
8945         * IOException when the key is not found, but the KeyProvider API
8946         * doesn't provide for that. If that API is ever changed to throw
8947         * something more specific (e.g. UnknownKeyException) then we can
8948         * update this to match it, or better yet, just rethrow the
8949         * KeyProvider's exception.
8950         */
8951        throw new IOException("Key " + keyName + " doesn't exist.");
8952      }
8953      createEncryptionZoneInt(src, metadata.getCipher(),
8954          keyName, cacheEntry != null);
8955      success = true;
8956    } catch (AccessControlException e) {
8957      logAuditEvent(false, "createEncryptionZone", src);
8958      throw e;
8959    } finally {
8960      RetryCache.setState(cacheEntry, success);
8961    }
8962  }
8963
  /**
   * Body of {@link #createEncryptionZone}: under the write lock, marks
   * {@code srcArg} as an encryption zone root by storing the EZ xattr on it
   * and recording that xattr in the edit log.
   *
   * @param srcArg        directory that becomes the zone root
   * @param cipher        cipher name taken from the key's metadata
   * @param keyName       name of the encryption key for the zone
   * @param logRetryCache whether the edit log entry records retry-cache state
   */
  private void createEncryptionZoneInt(final String srcArg, String cipher,
      String keyName, final boolean logRetryCache) throws IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    checkSuperuserPrivilege();
    final byte[][] pathComponents =
      FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-checked under the lock; privileges/HA state may have changed.
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
      src = resolvePath(src, pathComponents);

      final CipherSuite suite = CipherSuite.convert(cipher);
      // For now this is hardcoded, as we only support one method.
      final CryptoProtocolVersion version =
          CryptoProtocolVersion.ENCRYPTION_ZONES;
      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
          version, keyName);
      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
      xAttrs.add(ezXAttr);
      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Flush outside the lock; audit with the caller-supplied path.
    getEditLog().logSync();
    logAuditEvent(true, "createEncryptionZone", srcArg, null, resultingStat);
  }
8994
8995  /**
8996   * Get the encryption zone for the specified path.
8997   *
8998   * @param srcArg the path of a file or directory to get the EZ for.
8999   * @return the EZ of the of the path or null if none.
9000   * @throws AccessControlException  if the caller is not the superuser.
9001   * @throws UnresolvedLinkException if the path can't be resolved.
9002   */
9003  EncryptionZone getEZForPath(final String srcArg)
9004    throws AccessControlException, UnresolvedLinkException, IOException {
9005    String src = srcArg;
9006    HdfsFileStatus resultingStat = null;
9007    final byte[][] pathComponents =
9008        FSDirectory.getPathComponentsForReservedPath(src);
9009    boolean success = false;
9010    final FSPermissionChecker pc = getPermissionChecker();
9011    checkOperation(OperationCategory.READ);
9012    readLock();
9013    try {
9014      if (isPermissionEnabled) {
9015        checkPathAccess(pc, src, FsAction.READ);
9016      }
9017      checkOperation(OperationCategory.READ);
9018      src = resolvePath(src, pathComponents);
9019      final INodesInPath iip = dir.getINodesInPath(src, true);
9020      final EncryptionZone ret = dir.getEZForPath(iip);
9021      resultingStat = getAuditFileInfo(src, false);
9022      success = true;
9023      return ret;
9024    } finally {
9025      readUnlock();
9026      logAuditEvent(success, "getEZForPath", srcArg, null, resultingStat);
9027    }
9028  }
9029
9030  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
9031      throws IOException {
9032    boolean success = false;
9033    checkSuperuserPrivilege();
9034    checkOperation(OperationCategory.READ);
9035    readLock();
9036    try {
9037      checkSuperuserPrivilege();
9038      checkOperation(OperationCategory.READ);
9039      final BatchedListEntries<EncryptionZone> ret =
9040          dir.listEncryptionZones(prevId);
9041      success = true;
9042      return ret;
9043    } finally {
9044      readUnlock();
9045      logAuditEvent(success, "listEncryptionZones", null);
9046    }
9047  }
9048
9049  /**
9050   * Set xattr for a file or directory.
9051   * 
9052   * @param src
9053   *          - path on which it sets the xattr
9054   * @param xAttr
9055   *          - xAttr details to set
9056   * @param flag
9057   *          - xAttrs flags
9058   * @throws AccessControlException
9059   * @throws SafeModeException
9060   * @throws UnresolvedLinkException
9061   * @throws IOException
9062   */
9063  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag)
9064      throws AccessControlException, SafeModeException,
9065      UnresolvedLinkException, IOException {
9066    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
9067    if (cacheEntry != null && cacheEntry.isSuccess()) {
9068      return; // Return previous response
9069    }
9070    boolean success = false;
9071    try {
9072      setXAttrInt(src, xAttr, flag, cacheEntry != null);
9073      success = true;
9074    } catch (AccessControlException e) {
9075      logAuditEvent(false, "setXAttr", src);
9076      throw e;
9077    } finally {
9078      RetryCache.setState(cacheEntry, success);
9079    }
9080  }
9081  
9082  private void setXAttrInt(final String srcArg, XAttr xAttr,
9083      EnumSet<XAttrSetFlag> flag, boolean logRetryCache) throws IOException {
9084    String src = srcArg;
9085    nnConf.checkXAttrsConfigFlag();
9086    checkXAttrSize(xAttr);
9087    HdfsFileStatus resultingStat = null;
9088    FSPermissionChecker pc = getPermissionChecker();
9089    XAttrPermissionFilter.checkPermissionForApi(pc, xAttr,
9090        FSDirectory.isReservedRawName(src));
9091    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
9092    writeLock();
9093    try {
9094      checkOperation(OperationCategory.WRITE);
9095      checkNameNodeSafeMode("Cannot set XAttr on " + src);
9096      src = resolvePath(src, pathComponents);
9097      checkXAttrChangeAccess(src, xAttr, pc);
9098      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
9099      xAttrs.add(xAttr);
9100      dir.setXAttrs(src, xAttrs, flag);
9101      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
9102      resultingStat = getAuditFileInfo(src, false);
9103    } finally {
9104      writeUnlock();
9105    }
9106    getEditLog().logSync();
9107    logAuditEvent(true, "setXAttr", srcArg, null, resultingStat);
9108  }
9109
9110  /**
9111   * Verifies that the combined size of the name and value of an xattr is within
9112   * the configured limit. Setting a limit of zero disables this check.
9113   */
9114  private void checkXAttrSize(XAttr xAttr) {
9115    if (nnConf.xattrMaxSize == 0) {
9116      return;
9117    }
9118    int size = xAttr.getName().getBytes(Charsets.UTF_8).length;
9119    if (xAttr.getValue() != null) {
9120      size += xAttr.getValue().length;
9121    }
9122    if (size > nnConf.xattrMaxSize) {
9123      throw new HadoopIllegalArgumentException(
9124          "The XAttr is too big. The maximum combined size of the"
9125          + " name and value is " + nnConf.xattrMaxSize
9126          + ", but the total size is " + size);
9127    }
9128  }
9129  
9130  List<XAttr> getXAttrs(final String srcArg, List<XAttr> xAttrs)
9131      throws IOException {
9132    String src = srcArg;
9133    nnConf.checkXAttrsConfigFlag();
9134    FSPermissionChecker pc = getPermissionChecker();
9135    final boolean isRawPath = FSDirectory.isReservedRawName(src);
9136    boolean getAll = xAttrs == null || xAttrs.isEmpty();
9137    if (!getAll) {
9138      try {
9139        XAttrPermissionFilter.checkPermissionForApi(pc, xAttrs, isRawPath);
9140      } catch (AccessControlException e) {
9141        logAuditEvent(false, "getXAttrs", srcArg);
9142        throw e;
9143      }
9144    }
9145    checkOperation(OperationCategory.READ);
9146    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
9147    readLock();
9148    try {
9149      src = resolvePath(src, pathComponents);
9150      checkOperation(OperationCategory.READ);
9151      if (isPermissionEnabled) {
9152        checkPathAccess(pc, src, FsAction.READ);
9153      }
9154      List<XAttr> all = dir.getXAttrs(src);
9155      List<XAttr> filteredAll = XAttrPermissionFilter.
9156          filterXAttrsForApi(pc, all, isRawPath);
9157      if (getAll) {
9158        return filteredAll;
9159      } else {
9160        if (filteredAll == null || filteredAll.isEmpty()) {
9161          return null;
9162        }
9163        List<XAttr> toGet = Lists.newArrayListWithCapacity(xAttrs.size());
9164        for (XAttr xAttr : xAttrs) {
9165          boolean foundIt = false;
9166          for (XAttr a : filteredAll) {
9167            if (xAttr.getNameSpace() == a.getNameSpace()
9168                && xAttr.getName().equals(a.getName())) {
9169              toGet.add(a);
9170              foundIt = true;
9171              break;
9172            }
9173          }
9174          if (!foundIt) {
9175            throw new IOException(
9176                "At least one of the attributes provided was not found.");
9177        }
9178        }
9179        return toGet;
9180      }
9181    } catch (AccessControlException e) {
9182      logAuditEvent(false, "getXAttrs", srcArg);
9183      throw e;
9184    } finally {
9185      readUnlock();
9186    }
9187  }
9188
  /**
   * Lists the xattrs of a file or directory that are visible to the caller
   * after API-level permission filtering.
   *
   * @param src path to list xattrs for
   * @return the filtered xattrs of the given path
   * @throws IOException if xattrs are disabled or the path cannot be resolved
   */
  List<XAttr> listXAttrs(String src) throws IOException {
    nnConf.checkXAttrsConfigFlag();
    final FSPermissionChecker pc = getPermissionChecker();
    final boolean isRawPath = FSDirectory.isReservedRawName(src);
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    readLock();
    try {
      // NOTE(review): resolvePath runs before the under-lock checkOperation
      // here, unlike most other operations in this class — confirm intent.
      src = resolvePath(src, pathComponents);
      checkOperation(OperationCategory.READ);
      if (isPermissionEnabled) {
        /* To access xattr names, you need EXECUTE in the owning directory. */
        checkParentAccess(pc, src, FsAction.EXECUTE);
      }
      final List<XAttr> all = dir.getXAttrs(src);
      final List<XAttr> filteredAll = XAttrPermissionFilter.
        filterXAttrsForApi(pc, all, isRawPath);
      return filteredAll;
    } catch (AccessControlException e) {
      logAuditEvent(false, "listXAttrs", src);
      throw e;
    } finally {
      readUnlock();
    }
  }
9214  
9215  /**
9216   * Remove an xattr for a file or directory.
9217   *
9218   * @param src
9219   *          - path to remove the xattr from
9220   * @param xAttr
9221   *          - xAttr to remove
9222   * @throws AccessControlException
9223   * @throws SafeModeException
9224   * @throws UnresolvedLinkException
9225   * @throws IOException
9226   */
9227  void removeXAttr(String src, XAttr xAttr) throws IOException {
9228    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
9229    if (cacheEntry != null && cacheEntry.isSuccess()) {
9230      return; // Return previous response
9231    }
9232    boolean success = false;
9233    try {
9234      removeXAttrInt(src, xAttr, cacheEntry != null);
9235      success = true;
9236    } catch (AccessControlException e) {
9237      logAuditEvent(false, "removeXAttr", src);
9238      throw e;
9239    } finally {
9240      RetryCache.setState(cacheEntry, success);
9241    }
9242  }
9243
9244  void removeXAttrInt(final String srcArg, XAttr xAttr, boolean logRetryCache)
9245      throws IOException {
9246    String src = srcArg;
9247    nnConf.checkXAttrsConfigFlag();
9248    HdfsFileStatus resultingStat = null;
9249    FSPermissionChecker pc = getPermissionChecker();
9250    XAttrPermissionFilter.checkPermissionForApi(pc, xAttr,
9251        FSDirectory.isReservedRawName(src));
9252    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
9253    writeLock();
9254    try {
9255      checkOperation(OperationCategory.WRITE);
9256      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
9257      src = resolvePath(src, pathComponents);
9258      checkXAttrChangeAccess(src, xAttr, pc);
9259
9260      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
9261      xAttrs.add(xAttr);
9262      List<XAttr> removedXAttrs = dir.removeXAttrs(src, xAttrs);
9263      if (removedXAttrs != null && !removedXAttrs.isEmpty()) {
9264        getEditLog().logRemoveXAttrs(src, removedXAttrs, logRetryCache);
9265      } else {
9266        throw new IOException(
9267            "No matching attributes found for remove operation");
9268      }
9269      resultingStat = getAuditFileInfo(src, false);
9270    } finally {
9271      writeUnlock();
9272    }
9273    getEditLog().logSync();
9274    logAuditEvent(true, "removeXAttr", srcArg, null, resultingStat);
9275  }
9276
  /**
   * Verifies that the caller may change (set or remove) the given xattr on
   * {@code src}. Only USER-namespace xattrs are checked here; other
   * namespaces are vetted earlier by XAttrPermissionFilter in the callers.
   *
   * <p>For a directory with the sticky bit set, only the owner or a
   * superuser may change user xattrs; otherwise WRITE access on the path
   * is required.
   */
  private void checkXAttrChangeAccess(String src, XAttr xAttr,
      FSPermissionChecker pc) throws UnresolvedLinkException,
      AccessControlException {
    if (isPermissionEnabled && xAttr.getNameSpace() == XAttr.NameSpace.USER) {
      final INode inode = dir.getINode(src);
      if (inode != null &&
          inode.isDirectory() &&
          inode.getFsPermission().getStickyBit()) {
        if (!pc.isSuperUser()) {
          checkOwner(pc, src);
        }
      } else {
        checkPathAccess(pc, src, FsAction.WRITE);
      }
    }
  }
9293
9294  void checkAccess(String src, FsAction mode) throws AccessControlException,
9295      FileNotFoundException, UnresolvedLinkException, IOException {
9296    checkOperation(OperationCategory.READ);
9297    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
9298    readLock();
9299    try {
9300      checkOperation(OperationCategory.READ);
9301      src = FSDirectory.resolvePath(src, pathComponents, dir);
9302      if (dir.getINode(src) == null) {
9303        throw new FileNotFoundException("Path not found");
9304      }
9305      if (isPermissionEnabled) {
9306        FSPermissionChecker pc = getPermissionChecker();
9307        checkPathAccess(pc, src, mode);
9308      }
9309    } catch (AccessControlException e) {
9310      logAuditEvent(false, "checkAccess", src);
9311      throw e;
9312    } finally {
9313      readUnlock();
9314    }
9315  }
9316
9317  /**
9318   * Default AuditLogger implementation; used when no access logger is
9319   * defined in the config file. It can also be explicitly listed in the
9320   * config file.
9321   */
9322  private static class DefaultAuditLogger extends HdfsAuditLogger {
9323
9324    private boolean logTokenTrackingId;
9325
9326    @Override
9327    public void initialize(Configuration conf) {
9328      logTokenTrackingId = conf.getBoolean(
9329          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
9330          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
9331    }
9332
9333    @Override
9334    public void logAuditEvent(boolean succeeded, String userName,
9335        InetAddress addr, String cmd, String src, String dst,
9336        FileStatus status, UserGroupInformation ugi,
9337        DelegationTokenSecretManager dtSecretManager) {
9338      if (auditLog.isInfoEnabled()) {
9339        final StringBuilder sb = auditBuffer.get();
9340        sb.setLength(0);
9341        sb.append("allowed=").append(succeeded).append("\t");
9342        sb.append("ugi=").append(userName).append("\t");
9343        sb.append("ip=").append(addr).append("\t");
9344        sb.append("cmd=").append(cmd).append("\t");
9345        sb.append("src=").append(src).append("\t");
9346        sb.append("dst=").append(dst).append("\t");
9347        if (null == status) {
9348          sb.append("perm=null");
9349        } else {
9350          sb.append("perm=");
9351          sb.append(status.getOwner()).append(":");
9352          sb.append(status.getGroup()).append(":");
9353          sb.append(status.getPermission());
9354        }
9355        if (logTokenTrackingId) {
9356          sb.append("\t").append("trackingId=");
9357          String trackingId = null;
9358          if (ugi != null && dtSecretManager != null
9359              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
9360            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
9361              if (tid instanceof DelegationTokenIdentifier) {
9362                DelegationTokenIdentifier dtid =
9363                    (DelegationTokenIdentifier)tid;
9364                trackingId = dtSecretManager.getTokenTrackingId(dtid);
9365                break;
9366              }
9367            }
9368          }
9369          sb.append(trackingId);
9370        }
9371        sb.append("\t").append("proto=");
9372        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
9373        logAuditMessage(sb.toString());
9374      }
9375    }
9376
9377    public void logAuditMessage(String message) {
9378      auditLog.info(message);
9379    }
9380  }
9381
9382  private static void enableAsyncAuditLog() {
9383    if (!(auditLog instanceof Log4JLogger)) {
9384      LOG.warn("Log4j is required to enable async auditlog");
9385      return;
9386    }
9387    Logger logger = ((Log4JLogger)auditLog).getLogger();
9388    @SuppressWarnings("unchecked")
9389    List<Appender> appenders = Collections.list(logger.getAllAppenders());
9390    // failsafe against trying to async it more than once
9391    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
9392      AsyncAppender asyncAppender = new AsyncAppender();
9393      // change logger to have an async appender containing all the
9394      // previously configured appenders
9395      for (Appender appender : appenders) {
9396        logger.removeAppender(appender);
9397        asyncAppender.addAppender(appender);
9398      }
9399      logger.addAppender(asyncAppender);        
9400    }
9401  }
9402}
9403