cyliu
论坛版主
论坛版主
  • 注册日期2003-06-13
  • 最后登录2014-04-11
  • 粉丝5
  • 关注0
  • 积分1238分
  • 威望2531点
  • 贡献值0点
  • 好评度577点
  • 原创分14分
  • 专家分10分
阅读:3286回复:5

Linux 2.6.17.9内核文件系统调用详解

楼主#
更多 发布于:2007-05-11 10:31
Linux 2.6.17.9内核文件系统调用详解

本部分主要讲述文件I/O操作在2.6.17.9内核版本中的实现,包括主要的数据结构、宏定义和函数流程。以下分别讲述open、creat、close、read、write、lseek系统调用。

1 重要数据结构

1.1 struct file

/*
 * struct file as listed for kernel 2.6.17.9: groups the dentry/vfsmount
 * pair, the operations table, the current offset, owner and readahead
 * state for a file.
 */
struct file {
    /*
     * fu_list becomes invalid after file_free is called and queued via
     * fu_rcuhead for RCU freeing
     */
    union {
        struct list_head    fu_list; // file list linkage
        struct rcu_head     fu_rcuhead; // RCU head used when the file is queued for RCU freeing
    } f_u;
    struct dentry        *f_dentry; // dentry (directory entry) associated with the file
    struct vfsmount         *f_vfsmnt; // VFS mount point
    const struct file_operations    *f_op; // table of file operation function pointers
    atomic_t        f_count; // reference count
    unsigned int         f_flags; // NOTE(review): presumably the open flags - confirm
    mode_t            f_mode; // file mode
    loff_t            f_pos; // current file offset
    struct fown_struct    f_owner; // file owner structure (see struct fown_struct)
    unsigned int        f_uid, f_gid; // user id and group id of the file
    struct file_ra_state    f_ra; // readahead state (embedded struct file_ra_state, not a pointer)

    unsigned long        f_version;
    void            *f_security; // security-structure pointer used to hook file operations

    /* needed for tty driver, and maybe others */
    void            *private_data; // private data, e.g. for the tty driver

#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct list_head    f_ep_links; // list links used by the epoll mechanism
    spinlock_t        f_ep_lock; // spinlock; NOTE(review): presumably guards f_ep_links - confirm in fs/eventpoll.c
#endif /* #ifdef CONFIG_EPOLL */
    struct address_space    *f_mapping; // pointer to the address_space (address mapping) for the file
};

1.2 struct fown_struct
/*
 * Owner information for a file (embedded in struct file as f_owner):
 * identifies where SIGIO should be sent and who set the owner.
 */
struct fown_struct {
    rwlock_t lock;          /* protects pid, uid, euid fields */
    int pid;        /* pid or -pgrp where SIGIO should be sent */
    uid_t uid, euid;    /* uid/euid of process setting the owner */
    void *security; /* security-structure pointer used to hook file operations */
    int signum;        /* posix.1b rt signal to be delivered on IO */
};

1.3 struct file_ra_state

/*
 * Track a single file's readahead state.
 *
 * Holds two windows (the current one and the "ahead" one), each described
 * by a start/size pair, plus hit/miss counters.
 */
struct file_ra_state {
    unsigned long start;        /* Current window */
    unsigned long size;         /* paired with start for the current window */
    unsigned long flags;        /* ra flags RA_FLAG_xxx*/
    unsigned long cache_hit;    /* cache hit count*/
    unsigned long prev_page;    /* Cache last read() position */
    unsigned long ahead_start;    /* Ahead window */
    unsigned long ahead_size;     /* paired with ahead_start for the ahead window */
    unsigned long ra_pages;        /* Maximum readahead window */
    unsigned long mmap_hit;        /* Cache hit stat for mmap accesses */
    unsigned long mmap_miss;    /* Cache miss stat for mmap accesses */
};

1.4 struct address_space

/*
 * Page mapping for an owner object (an inode or block_device, per the
 * host field): a radix tree of pages plus the bookkeeping for memory
 * mappings, writeback and the operations table.
 */
struct address_space {
    struct inode        *host;        /* owner: inode, block_device */
    struct radix_tree_root    page_tree;    /* radix tree of all pages */
    rwlock_t        tree_lock;    /* and rwlock protecting it */
    unsigned int        i_mmap_writable;/* count VM_SHARED mappings */
    struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */
    struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
    spinlock_t        i_mmap_lock;    /* protect tree, count, list */
    unsigned int        truncate_count;    /* Cover race condition with truncate */
    unsigned long        nrpages;    /* number of total pages */
    pgoff_t            writeback_index;/* writeback starts here */
    struct address_space_operations *a_ops;    /* methods */
    unsigned long        flags;        /* error bits/gfp mask */
    struct backing_dev_info *backing_dev_info; /* device readahead, etc */
    spinlock_t        private_lock;    /* for use by the address_space */
    struct list_head    private_list;    /* also for use by the address_space */
    struct address_space    *assoc_mapping;    /* also for use by the address_space */
} __attribute__((aligned(sizeof(long))));

/*
 * Method table referenced by struct address_space (its a_ops field):
 * page read/write, dirtying, invalidation, direct I/O and migration hooks.
 */
struct address_space_operations {
    /* Write / read a single page; sync_page takes only the page. */
    int (*writepage)(struct page *page, struct writeback_control *wbc);
    int (*readpage)(struct file *, struct page *);
    void (*sync_page)(struct page *);

    /* Write back some dirty pages from this mapping. */
    int (*writepages)(struct address_space *, struct writeback_control *);

    /* Set a page dirty.  Return true if this dirtied it */
    int (*set_page_dirty)(struct page *page);

    /* Multi-page counterpart of readpage: takes a list of nr_pages pages. */
    int (*readpages)(struct file *filp, struct address_space *mapping,
            struct list_head *pages, unsigned nr_pages);

    /*
     * ext3 requires that a successful prepare_write() call be followed
     * by a commit_write() call - they must be balanced
     */
    int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
    int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
    /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
    sector_t (*bmap)(struct address_space *, sector_t);
    void (*invalidatepage) (struct page *, unsigned long);
    int (*releasepage) (struct page *, gfp_t);
    ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
            loff_t offset, unsigned long nr_segs);
    struct page* (*get_xip_page)(struct address_space *, sector_t,
            int);
    /* migrate the contents of a page to the specified target */
    int (*migratepage) (struct page *, struct page *);
};

1.5 struct block_device

/*
 * Block device descriptor, keyed by bd_dev; carries open/mount mutexes,
 * holder bookkeeping, and links to the partition and gendisk structures.
 */
struct block_device {
    dev_t            bd_dev;  /* not a kdev_t - it's a search key */
    struct inode *        bd_inode;    /* will die */
    int            bd_openers;
    struct mutex        bd_mutex;    /* open/close mutex */
    struct mutex        bd_mount_mutex;    /* mount mutex */
    struct list_head    bd_inodes;
    void *            bd_holder;
    int            bd_holders;
#ifdef CONFIG_SYSFS
    /* only present when sysfs support is configured */
    struct list_head    bd_holder_list;
#endif
    struct block_device *    bd_contains;
    unsigned        bd_block_size;
    struct hd_struct *    bd_part;
    /* number of times partitions within this device have been opened. */
    unsigned        bd_part_count;
    int            bd_invalidated;
    struct gendisk *    bd_disk;
    struct list_head    bd_list;
    struct backing_dev_info *bd_inode_backing_dev_info;
    /*
     * Private data.  You must have bd_claim'ed the block_device
     * to use this.  NOTE:  bd_claim allows an owner to claim
     * the same device multiple times, the owner must take special
     * care to not mess up bd_private for that case.
     */
    unsigned long        bd_private;
};

1.6 struct backing_dev_info

/*
 * Per-backing-device information: readahead limit, state bits,
 * capabilities, and the congestion / unplug callbacks with their data.
 */
struct backing_dev_info {
    unsigned long ra_pages;    /* max readahead in PAGE_CACHE_SIZE units */
    unsigned long state;    /* Always use atomic bitops on this */
    unsigned int capabilities; /* Device capabilities */
    congested_fn *congested_fn; /* Function pointer if device is md/dm */
    void *congested_data;    /* Pointer to aux data for congested func */
    void (*unplug_io_fn)(struct backing_dev_info *, struct page *); /* unplug callback */
    void *unplug_io_data;    /* NOTE(review): presumably aux data for unplug_io_fn - confirm */
};

1.7 struct files_struct

对于内核而言,所有打开文件都由文件描述符引用。文件描述符是一个非负整数。当打开一个现存文件或创建一个新文件时,内核向进程返回一个文件描述符。

当读、写一个文件时,用open或creat返回的文件描述符标识该文件,将其作为参数传送给read或write。在POSIX.1应用程序中,文件描述符0、1和2分别对应常数STDIN_FILENO、STDOUT_FILENO和STDERR_FILENO,即标准输入、标准输出和标准出错输出,这些常数都定义在头文件<unistd.h>中。

文件描述符的取值范围是0~OPEN_MAX-1。在Linux中,文件描述符的类型是int(32位),但一个进程实际可同时打开的文件数并不是由int的表示范围决定的,而是受资源限制RLIMIT_NOFILE(可用ulimit -n查看,常见默认值为1024)约束,其上限由内核中的NR_OPEN决定。
/*
 * Open file table structure
 *
 * Embeds an initial fdtable (fdtab) together with its backing fd sets and
 * fd array, and keeps a pointer (fdt) to the table.
 */
struct files_struct {
  /*
   * read mostly part
   */
    atomic_t count; /* reference count */
    struct fdtable *fdt; /* pointer to the fd table; points at fdtab */
    struct fdtable fdtab;/* the embedded fd table */
  /*
   * written part on a separate cache line in SMP
   */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    int next_fd; // next free fd
    struct embedded_fd_set close_on_exec_init; /* set of close-on-exec fds */
    struct embedded_fd_set open_fds_init;/* set of open fds */
    struct file * fd_array[NR_OPEN_DEFAULT]; /* array of open files */
};

/*
 * File descriptor table: the fd array, the open and close-on-exec fd sets,
 * and their capacities.
 */
struct fdtable {
    unsigned int max_fds; // maximum number of file descriptors
    int max_fdset; // maximum capacity of the fd sets
    struct file ** fd;      /* current fd array */
    fd_set *close_on_exec; // set of close-on-exec fds
    fd_set *open_fds; // set of open fds
    struct rcu_head rcu;
    struct files_struct *free_files; /* back pointer to the files_struct */
    struct fdtable *next; /* linked-list pointer */
};

2 文件操作

2.1 open 操作

2.1.1 调用关系

sys_open
| ----------- getname
| ----------- filp_open
| | ------------ open_namei
| | | ----------- may_open
| | ------------ dentry_open
走走看看开源好 Solaris vs Linux
cyliu
论坛版主
论坛版主
  • 注册日期2003-06-13
  • 最后登录2014-04-11
  • 粉丝5
  • 关注0
  • 积分1238分
  • 威望2531点
  • 贡献值0点
  • 好评度577点
  • 原创分14分
  • 专家分10分
沙发#
发布于:2007-05-15 12:03
感觉英文注释已经很好了,所以就偷懒了,呵呵
走走看看开源好 Solaris vs Linux
游客

返回顶部