Source code for msgvis.apps.corpus.models

import operator
from django.db import models
from django.db.models import Q
from caching.base import CachingManager, CachingMixin

from msgvis.apps.base import models as base_models
from msgvis.apps.corpus import utils

import re

import os
from msgvis.settings.common import DEBUG


class Dataset(models.Model):
    """A top-level dataset object containing messages."""

    name = models.CharField(max_length=150)
    """The name of the dataset"""

    description = models.TextField()
    """A description of the dataset."""

    created_at = models.DateTimeField(auto_now_add=True)
    """The :py:class:`datetime.datetime` when the dataset was created."""

    start_time = models.DateTimeField(null=True, default=None, blank=True)
    """The time of the first real message in the dataset"""

    end_time = models.DateTimeField(null=True, default=None, blank=True)
    """The time of the last real message in the dataset"""

    has_prefetched_images = models.BooleanField(default=False)
    """When true, :class:`Person` and :class:`Message` return local file names instead of remote image urls."""

    @property
    def message_count(self):
        """The number of messages in the dataset, via ``message_set.count()`` on each access."""
        return self.message_set.count()

    def __unicode__(self):
        return self.name

    @staticmethod
    def _dimension_params(spec):
        """Return a copy of *spec* without its ``"dimension"`` entry."""
        return {key: value for key, value in spec.items() if key != "dimension"}

    @classmethod
    def _apply_dimension_specs(cls, messages, filters, excludes):
        """Apply every filter spec, then every exclude spec, to *messages*."""
        for spec in filters:
            messages = spec["dimension"].filter(messages, **cls._dimension_params(spec))

        for spec in excludes:
            # The params come from the individual spec, not from the list of specs.
            messages = spec["dimension"].exclude(messages, **cls._dimension_params(spec))

        return messages

    def get_example_messages(self, filters=None, excludes=None):
        """Get example messages given some filters (dictionaries containing dimensions and filter params)

        :param filters: iterable of dicts, each with a ``"dimension"`` object plus the
            keyword arguments handed to ``dimension.filter``. ``None`` means no filters.
        :param excludes: same shape as *filters*, handed to ``dimension.exclude``.
        :return: the dataset's messages narrowed by every filter and exclude.
        """
        return self._apply_dimension_specs(self.message_set.all(), filters or [], excludes or [])

    def get_example_messages_by_groups(self, groups, filters=None, excludes=None):
        """Get example messages for several groups as one raw ``UNION`` query.

        :param groups: ids of the groups to draw messages from. Replaced by the ids
            named in any ``groups`` dimension filter; ids named in a ``groups``
            dimension exclude are dropped.
        :param filters: same shape as in :meth:`get_example_messages`.
        :param excludes: same shape as in :meth:`get_example_messages`.
        :return: a raw queryset of :class:`Message`, or an empty queryset when no
            group is left to query.
        """
        filters = filters or []
        excludes = excludes or []

        include_groups = [int(spec['value']) for spec in filters
                          if spec['dimension'].key == 'groups']
        if include_groups:
            groups = include_groups
        exclude_groups = [int(spec['value']) for spec in excludes
                          if spec['dimension'].key == 'groups']
        groups = [group for group in groups if group not in exclude_groups]

        if not groups:
            # Nothing to union; an empty SQL string cannot be executed.
            return Message.objects.none()

        group_queries = []
        for group in groups:
            messages = self.groups.get(id=group).messages
            messages = self._apply_dimension_specs(messages, filters, excludes)
            # NOTE(review): str(query) is not guaranteed to be executable SQL;
            # utils.quote / utils.convert_boolean presumably patch it up - confirm.
            group_queries.append("(%s)" % (utils.quote(str(messages.query))))

        query = utils.convert_boolean(" UNION ".join(group_queries))
        return Message.objects.raw(query)

    def get_dictionary(self):
        """Return the first related dictionary, or ``None`` when there is none."""
        dictionaries = self.dictionary.all()
        if len(dictionaries) > 0:
            return dictionaries[0]
        return None

    def get_advanced_search_results(self, keywords_text, include_types):
        """Search the dataset's messages with a comma separated keyword expression.

        Each comma separated clause is a space separated list of words. A message
        matches a clause when it passes every word filter of that clause, and it
        matches the search when it matches any clause. Clauses starting with
        ``"NOT "`` contribute word filters that are excluded from the final result.

        :param keywords_text: the keyword expression, e.g. ``"cat dog,NOT bird"``.
        :param include_types: message types to restrict the search to; an empty
            collection means no restriction.
        :return: a distinct queryset of matching messages.
        """
        word_queryset = self.tweet_words.all()
        message_queryset = self.message_set.all()
        if len(include_types) > 0:
            type_names = [message_type.name for message_type in include_types]
            message_queryset = message_queryset.filter(utils.levels_or('type__name', type_names))

        matched = self.message_set.none()
        exclusive_keywords = []
        for clause in keywords_text.split(','):
            # Allow whitespace after the comma, e.g. "cat, NOT dog".
            clause = clause.strip()
            negated = clause.startswith("NOT ")
            words = (clause[4:] if negated else clause).split(' ')

            # presumably a list of Q-like filters, one per word - verify in utils.get_word_objs
            word_list = utils.get_word_objs(queryset=word_queryset, text_field_name='original_text', related_field_name="tweet_words__id", words=words)
            if len(word_list) == 0:
                continue

            if negated:
                # TODO: makes this real AND
                exclusive_keywords.extend(word_list)
            else:
                clause_queryset = message_queryset
                for word_filter in word_list:
                    clause_queryset = clause_queryset.filter(word_filter)
                matched |= clause_queryset

        for word_filter in exclusive_keywords:
            matched = matched.exclude(word_filter)

        return matched.distinct()

    def get_precalc_distribution(self, dimension, search_key=None, page=None, page_size=100, mode=None):
        """Return the precalculated distribution of *dimension*, largest count first.

        :param dimension: object providing ``key`` and ``get_domain_labels(domain)``.
        :param search_key: optional case-insensitive substring the level must contain.
        :param page: 1-based page number, or ``None`` to disable paging.
        :param page_size: number of levels per page.
        :param mode: when not paging, ``"omit_others"`` or ``"enable_others"`` limits
            the result to the 10 most frequent levels.
        :return: dict with ``"table"``, ``"domains"`` and ``"domain_labels"``, or
            ``None`` when the requested page lies past the last level.
        """
        dimension_key = dimension.key
        distribution = self.distributions.filter(dimension_key=dimension_key)
        if search_key is not None:
            distribution = distribution.filter(level__icontains=search_key)
        distribution = distribution.order_by('-count')

        if page is not None:
            total_num_levels = distribution.count()
            start = (page - 1) * page_size
            end = min(start + page_size, total_num_levels)

            # no level left (start == total means the page would be empty)
            if total_num_levels == 0 or start >= total_num_levels:
                return None

            distribution = distribution[start:end]

        elif mode == "omit_others" or mode == "enable_others":
            MAX_CATEGORICAL_LEVELS = 10
            distribution = distribution[:MAX_CATEGORICAL_LEVELS]
        else:
            distribution = distribution.all()

        # Evaluate once; both the domain and the table are built from the same rows.
        rows = list(distribution)
        domain = [row.level for row in rows]
        labels = dimension.get_domain_labels(domain)
        table = [{dimension_key: row.level, "value": row.count} for row in rows]

        return {
            "table": table,
            "domains": {dimension_key: domain},
            "domain_labels": {dimension_key: labels},
        }




class MessageType(models.Model):
    """The type of a message, e.g. retweet, reply, original, system..."""

    name = models.CharField(max_length=100, unique=True)
    """The unique name of the message type"""

    def __unicode__(self):
        return self.name


class Language(CachingMixin, models.Model):
    """Represents the language of a message or a user"""

    code = models.SlugField(max_length=10, unique=True)
    """A short, unique language code like 'en'"""

    name = models.CharField(max_length=100)
    """The full name of the language"""

    objects = CachingManager()

    def __unicode__(self):
        # Rendered as "<code>:<name>", e.g. "en:English".
        return "%s:%s" % (self.code, self.name)


class Url(models.Model):
    """A url from a message"""

    domain = models.CharField(max_length=100, db_index=True)
    """The root domain of the url (indexed)"""

    short_url = models.CharField(max_length=250, blank=True)
    """A shortened url; may be blank"""

    full_url = models.TextField()
    """The full url"""


class Hashtag(models.Model):
    """A hashtag in a message"""

    text = base_models.Utf8CharField(max_length=100, db_index=True)
    """The text of the hashtag, without the hash (indexed)"""


class Media(models.Model):
    """
    Linked media, e.g. photos or videos.
    """

    # NOTE: shadows the builtin ``type`` inside the class body only; the name is
    # the database column and must not be renamed without a schema migration.
    type = models.CharField(max_length=50)
    """The kind of media this is."""

    media_url = models.CharField(max_length=250)
    """A url where the media may be accessed"""


class Timezone(CachingMixin, models.Model):
    """
    The timezone of a message or user
    """

    olson_code = models.CharField(max_length=40, null=True, blank=True, default=None)
    """The timezone code from pytz; may be null."""

    name = models.CharField(max_length=150, db_index=True)
    """Another name for the timezone, perhaps the country where it is located?"""

    objects = CachingManager()


class Person(models.Model):
    """
    A person who sends messages in a dataset.
    """

    class Meta:
        # The trailing comma makes this a tuple of field tuples, matching
        # Message.Meta; without it the value is a flat tuple of two strings.
        index_together = (
            ('dataset', 'original_id'),  # used by the importer
        )

    dataset = models.ForeignKey(Dataset)
    """Which :class:`Dataset` this person belongs to"""

    original_id = models.BigIntegerField(null=True, blank=True, default=None)
    """An external id for the person, e.g. a user id from Twitter"""

    username = base_models.Utf8CharField(max_length=150, null=True, blank=True, default=None)
    """Username is a short system-y name."""

    full_name = base_models.Utf8CharField(max_length=250, null=True, blank=True, default=None)
    """Full name is a longer user-friendly name"""

    language = models.ForeignKey(Language, null=True, blank=True, default=None)
    """The person's primary :class:`Language`"""

    message_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of messages the person produced"""

    replied_to_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person's messages were replied to"""

    shared_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person's messages were shared or retweeted"""

    mentioned_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person was mentioned in other people's messages"""

    friend_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of people this user has connected to"""

    follower_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of people who have connected to this person"""

    profile_image_url = models.TextField(null=True, blank=True, default="")
    """The person's profile image url"""

    def __unicode__(self):
        # username is nullable; never hand None back to unicode().
        return self.username or self.full_name or u""

    @property
    def profile_image_processed_url(self):
        """The profile image location to use when rendering this person.

        Returns ``profile_image_url`` unchanged unless the dataset has prefetched
        images and the url ends in ``/<file name>.<suffix>``; in that case the
        result is ``profile_<original_id>.<suffix>``.
        """
        url = self.profile_image_url
        # The url may be null or empty; only a non-empty url can be rewritten.
        if url and self.dataset.has_prefetched_images:
            match = re.search(r'/[_\.\-\w\d]+\.([\w]+)$', url)
            if match:
                url = "profile_" + str(self.original_id) + "." + match.group(1)

        return url
        


class Message(models.Model):
    """
    The Message is the central data entity for the dataset.
    """
    class Meta:
        index_together = (
            ('dataset', 'original_id'),  # used by importer
            ('dataset', 'time'),
        )

    dataset = models.ForeignKey(Dataset)
    """Which :class:`Dataset` the message belongs to"""

    original_id = models.BigIntegerField(null=True, blank=True, default=None)
    """An external id for the message, e.g. a tweet id from Twitter"""

    type = models.ForeignKey(MessageType, null=True, blank=True, default=None)
    """The :class:`MessageType` Message type: retweet, reply, origin..."""

    sender = models.ForeignKey(Person, null=True, blank=True, default=None)
    """The :class:`Person` who sent the message"""

    time = models.DateTimeField(null=True, blank=True, default=None)
    """The :py:class:`datetime.datetime` (in UTC) when the message was sent"""

    language = models.ForeignKey(Language, null=True, blank=True, default=None)
    """The :class:`Language` of the message."""

    SENTIMENT_POSITIVE = 1
    SENTIMENT_NEUTRAL  = 0
    SENTIMENT_NEGATIVE = -1
    SENTIMENT_CHOICES = (
        (SENTIMENT_POSITIVE, "positive"),
        (SENTIMENT_NEUTRAL,  "neutral"),
        (SENTIMENT_NEGATIVE, "negative")
    )

    sentiment = models.SmallIntegerField(choices=SENTIMENT_CHOICES, null=True, blank=True, default=None)
    """The sentiment label for message."""

    timezone = models.ForeignKey(Timezone, null=True, blank=True, default=None)
    """The :class:`Timezone` of the message."""

    replied_to_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of replies this message received."""

    shared_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times this message was shared or retweeted."""

    contains_hashtag = models.BooleanField(blank=True, default=False)
    """True if the message has a :class:`Hashtag`."""

    contains_url = models.BooleanField(blank=True, default=False)
    """True if the message has a :class:`Url`."""

    contains_media = models.BooleanField(blank=True, default=False)
    """True if the message has any :class:`Media`."""

    contains_mention = models.BooleanField(blank=True, default=False)
    """True if the message mentions any :class:`Person`."""

    urls = models.ManyToManyField(Url, null=True, blank=True, default=None)
    """The set of :class:`Url` in the message."""

    hashtags = models.ManyToManyField(Hashtag, null=True, blank=True, default=None)
    """The set of :class:`Hashtag` in the message."""

    media = models.ManyToManyField(Media, null=True, blank=True, default=None)
    """The set of :class:`Media` in the message."""

    mentions = models.ManyToManyField(Person, related_name="mentioned_in", null=True, blank=True, default=None)
    """The set of :class:`Person` mentioned in the message."""

    text = base_models.Utf8TextField(null=True, blank=True, default="")
    """The actual text of the message."""

    @property
    def embedded_html(self):
        """The message text passed through ``utils.render_html_tag``."""
        # Alternative kept for reference: utils.get_embedded_html(self.original_id)
        return utils.render_html_tag(self.text)

    @property
    def media_url(self):
        """The url of the first attached :class:`Media`, or ``""`` when there is none.

        When the dataset has prefetched images and the url ends in
        ``/<file name>.<suffix>``, only ``<file name>.<suffix>`` is returned.
        """
        if not self.contains_media:
            return ""

        # contains_media is a denormalized flag; tolerate it being set while no
        # Media row is attached instead of raising IndexError.
        attached = list(self.media.all()[:1])
        if not attached:
            return ""

        url = attached[0].media_url
        if self.dataset.has_prefetched_images:
            match = re.search(r'/([_\.\-\w\d]+\.[\w]+)$', url)
            if match:
                url = match.group(1)
        return url

    def __repr__(self):
        # text is nullable; fall back to an empty string so concatenation cannot fail.
        return str(self.time) + " || " + (self.text or "")

    def __unicode__(self):
        return self.__repr__()